2023-07-26 14:15:51,620 ----------------------------------------------------------------------------------------------------
2023-07-26 14:15:51,621 Model: "SequenceTagger(
(embeddings): TransformerWordEmbeddings(
(model): XLMRobertaModel(
(embeddings): XLMRobertaEmbeddings(
(word_embeddings): Embedding(250003, 768)
(position_embeddings): Embedding(514, 768, padding_idx=1)
(token_type_embeddings): Embedding(1, 768)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(encoder): XLMRobertaEncoder(
(layer): ModuleList(
(0): XLMRobertaLayer(
(attention): XLMRobertaAttention(
(self): XLMRobertaSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): XLMRobertaSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): XLMRobertaIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): XLMRobertaOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(1): XLMRobertaLayer(
(attention): XLMRobertaAttention(
(self): XLMRobertaSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): XLMRobertaSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): XLMRobertaIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): XLMRobertaOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(2): XLMRobertaLayer(
(attention): XLMRobertaAttention(
(self): XLMRobertaSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): XLMRobertaSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): XLMRobertaIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): XLMRobertaOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(3): XLMRobertaLayer(
(attention): XLMRobertaAttention(
(self): XLMRobertaSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): XLMRobertaSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): XLMRobertaIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): XLMRobertaOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(4): XLMRobertaLayer(
(attention): XLMRobertaAttention(
(self): XLMRobertaSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): XLMRobertaSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): XLMRobertaIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): XLMRobertaOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(5): XLMRobertaLayer(
(attention): XLMRobertaAttention(
(self): XLMRobertaSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): XLMRobertaSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): XLMRobertaIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): XLMRobertaOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(6): XLMRobertaLayer(
(attention): XLMRobertaAttention(
(self): XLMRobertaSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): XLMRobertaSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): XLMRobertaIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): XLMRobertaOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(7): XLMRobertaLayer(
(attention): XLMRobertaAttention(
(self): XLMRobertaSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): XLMRobertaSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): XLMRobertaIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): XLMRobertaOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(8): XLMRobertaLayer(
(attention): XLMRobertaAttention(
(self): XLMRobertaSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): XLMRobertaSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): XLMRobertaIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): XLMRobertaOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(9): XLMRobertaLayer(
(attention): XLMRobertaAttention(
(self): XLMRobertaSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): XLMRobertaSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): XLMRobertaIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): XLMRobertaOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(10): XLMRobertaLayer(
(attention): XLMRobertaAttention(
(self): XLMRobertaSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): XLMRobertaSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): XLMRobertaIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): XLMRobertaOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(11): XLMRobertaLayer(
(attention): XLMRobertaAttention(
(self): XLMRobertaSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): XLMRobertaSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): XLMRobertaIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): XLMRobertaOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
)
(pooler): XLMRobertaPooler(
(dense): Linear(in_features=768, out_features=768, bias=True)
(activation): Tanh()
)
)
)
(word_dropout): WordDropout(p=0.05)
(locked_dropout): LockedDropout(p=0.5)
(linear): Linear(in_features=768, out_features=158, bias=True)
(loss_function): ViterbiLoss()
(crf): CRF()
)"
2023-07-26 14:15:51,622 ----------------------------------------------------------------------------------------------------
2023-07-26 14:15:51,622 Corpus: "Corpus: 7767 train + 409 dev + 0 test sentences"
2023-07-26 14:15:51,622 ----------------------------------------------------------------------------------------------------
2023-07-26 14:15:51,622 Parameters:
2023-07-26 14:15:51,622 - learning_rate: "0.000050"
2023-07-26 14:15:51,622 - mini_batch_size: "32"
2023-07-26 14:15:51,622 - patience: "3"
2023-07-26 14:15:51,622 - anneal_factor: "0.5"
2023-07-26 14:15:51,622 - max_epochs: "50"
2023-07-26 14:15:51,622 - shuffle: "True"
2023-07-26 14:15:51,622 - train_with_dev: "False"
2023-07-26 14:15:51,622 - batch_growth_annealing: "False"
2023-07-26 14:15:51,622 ----------------------------------------------------------------------------------------------------
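As a quick sanity check on the numbers above (an aside, not part of the original log): 7767 training sentences at a mini-batch size of 32 give the 243 iterations per epoch reported in the lines below.

import math
print(math.ceil(7767 / 32))  # 243 batches per epoch, matching "iter .../243" below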
2023-07-26 14:15:51,622 Model training base path: "/scratch/skulick/ppchy-11-pos/xlmb-ck05-yid1/split_final/train"
2023-07-26 14:15:51,623 ----------------------------------------------------------------------------------------------------
2023-07-26 14:15:51,623 Device: cuda:0
2023-07-26 14:15:51,623 ----------------------------------------------------------------------------------------------------
2023-07-26 14:15:51,623 Embeddings storage mode: none
2023-07-26 14:15:51,623 ----------------------------------------------------------------------------------------------------
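A training call consistent with the hyperparameters logged above would look roughly like the sketch below. The use of ModelTrainer.fine_tune() is an assumption, suggested by the linear warm-up-then-decay of the learning rate in the epoch lines that follow; the base path is copied from the log, and tagger and corpus refer to the earlier sketch.

from flair.trainers import ModelTrainer

trainer = ModelTrainer(tagger, corpus)
trainer.fine_tune(
    "/scratch/skulick/ppchy-11-pos/xlmb-ck05-yid1/split_final/train",
    learning_rate=5e-5,      # "learning_rate: 0.000050" above
    mini_batch_size=32,
    max_epochs=50,
)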
2023-07-26 14:16:43,612 epoch 1 - iter 24/243 - loss 5.78182875 - time (sec): 51.99 - samples/sec: 143.68 - lr: 0.000001
2023-07-26 14:17:33,864 epoch 1 - iter 48/243 - loss 5.72562134 - time (sec): 102.24 - samples/sec: 145.85 - lr: 0.000002
2023-07-26 14:18:18,655 epoch 1 - iter 72/243 - loss 5.61608578 - time (sec): 147.03 - samples/sec: 153.94 - lr: 0.000003
2023-07-26 14:19:03,423 epoch 1 - iter 96/243 - loss 5.47788448 - time (sec): 191.80 - samples/sec: 157.84 - lr: 0.000004
2023-07-26 14:19:48,078 epoch 1 - iter 120/243 - loss 5.26991238 - time (sec): 236.45 - samples/sec: 160.79 - lr: 0.000005
2023-07-26 14:20:32,722 epoch 1 - iter 144/243 - loss 5.07404788 - time (sec): 281.10 - samples/sec: 162.52 - lr: 0.000006
2023-07-26 14:21:18,887 epoch 1 - iter 168/243 - loss 4.86972776 - time (sec): 327.26 - samples/sec: 164.17 - lr: 0.000007
2023-07-26 14:22:07,928 epoch 1 - iter 192/243 - loss 4.66109804 - time (sec): 376.31 - samples/sec: 164.20 - lr: 0.000008
2023-07-26 14:22:56,658 epoch 1 - iter 216/243 - loss 4.44788101 - time (sec): 425.04 - samples/sec: 163.40 - lr: 0.000009
2023-07-26 14:23:45,642 epoch 1 - iter 240/243 - loss 4.23693631 - time (sec): 474.02 - samples/sec: 163.81 - lr: 0.000010
2023-07-26 14:23:51,273 ----------------------------------------------------------------------------------------------------
2023-07-26 14:23:51,273 EPOCH 1 done: loss 4.2141 - lr 0.000010
2023-07-26 14:23:53,076 Evaluating as a multi-label problem: False
2023-07-26 14:23:53,119 DEV : loss 1.6606154441833496 - f1-score (micro avg) 0.7017
2023-07-26 14:23:53,129 saving best model
2023-07-26 14:23:55,463 ----------------------------------------------------------------------------------------------------
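The DEV lines report a micro-averaged F1 score. As a reminder of what that means (a generic sketch, not Flair's internal evaluation code), micro-averaging pools true positives, false positives, and false negatives across all labels before computing precision and recall:

def micro_f1(counts):
    """counts: dict mapping label -> (tp, fp, fn); the numbers below are made up."""
    tp = sum(c[0] for c in counts.values())
    fp = sum(c[1] for c in counts.values())
    fn = sum(c[2] for c in counts.values())
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    return 2 * precision * recall / (precision + recall) if precision + recall else 0.0

print(micro_f1({"NOUN": (90, 5, 10), "VERB": (80, 10, 5)}))  # ~0.919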
2023-07-26 14:24:39,905 epoch 2 - iter 24/243 - loss 1.93643008 - time (sec): 44.44 - samples/sec: 172.43 - lr: 0.000011
2023-07-26 14:25:24,584 epoch 2 - iter 48/243 - loss 1.80170810 - time (sec): 89.12 - samples/sec: 174.28 - lr: 0.000012
2023-07-26 14:26:09,249 epoch 2 - iter 72/243 - loss 1.68553020 - time (sec): 133.79 - samples/sec: 174.97 - lr: 0.000013
2023-07-26 14:26:53,935 epoch 2 - iter 96/243 - loss 1.59018149 - time (sec): 178.47 - samples/sec: 175.90 - lr: 0.000014
2023-07-26 14:27:38,417 epoch 2 - iter 120/243 - loss 1.51168641 - time (sec): 222.95 - samples/sec: 176.47 - lr: 0.000015
2023-07-26 14:28:23,238 epoch 2 - iter 144/243 - loss 1.44496232 - time (sec): 267.77 - samples/sec: 176.24 - lr: 0.000016
2023-07-26 14:29:07,485 epoch 2 - iter 168/243 - loss 1.38343183 - time (sec): 312.02 - samples/sec: 175.74 - lr: 0.000017
2023-07-26 14:29:51,869 epoch 2 - iter 192/243 - loss 1.32848150 - time (sec): 356.41 - samples/sec: 175.52 - lr: 0.000018
2023-07-26 14:30:36,135 epoch 2 - iter 216/243 - loss 1.28678633 - time (sec): 400.67 - samples/sec: 174.52 - lr: 0.000019
2023-07-26 14:31:20,592 epoch 2 - iter 240/243 - loss 1.24063251 - time (sec): 445.13 - samples/sec: 174.61 - lr: 0.000020
2023-07-26 14:31:25,683 ----------------------------------------------------------------------------------------------------
2023-07-26 14:31:25,683 EPOCH 2 done: loss 1.2362 - lr 0.000020
2023-07-26 14:31:27,442 Evaluating as a multi-label problem: False
2023-07-26 14:31:27,484 DEV : loss 0.4555579721927643 - f1-score (micro avg) 0.9132
2023-07-26 14:31:27,494 saving best model
2023-07-26 14:31:30,740 ----------------------------------------------------------------------------------------------------
2023-07-26 14:32:15,374 epoch 3 - iter 24/243 - loss 0.80478615 - time (sec): 44.63 - samples/sec: 181.44 - lr: 0.000021
2023-07-26 14:32:59,817 epoch 3 - iter 48/243 - loss 0.76412570 - time (sec): 89.08 - samples/sec: 179.04 - lr: 0.000022
2023-07-26 14:33:44,386 epoch 3 - iter 72/243 - loss 0.74620943 - time (sec): 133.64 - samples/sec: 176.74 - lr: 0.000023
2023-07-26 14:34:28,788 epoch 3 - iter 96/243 - loss 0.72917808 - time (sec): 178.05 - samples/sec: 175.92 - lr: 0.000024
2023-07-26 14:35:13,386 epoch 3 - iter 120/243 - loss 0.72089137 - time (sec): 222.64 - samples/sec: 176.15 - lr: 0.000025
2023-07-26 14:35:57,934 epoch 3 - iter 144/243 - loss 0.70075087 - time (sec): 267.19 - samples/sec: 175.65 - lr: 0.000026
2023-07-26 14:36:42,264 epoch 3 - iter 168/243 - loss 0.68433087 - time (sec): 311.52 - samples/sec: 174.95 - lr: 0.000027
2023-07-26 14:37:26,778 epoch 3 - iter 192/243 - loss 0.67039041 - time (sec): 356.04 - samples/sec: 175.14 - lr: 0.000028
2023-07-26 14:38:11,135 epoch 3 - iter 216/243 - loss 0.66061953 - time (sec): 400.39 - samples/sec: 175.13 - lr: 0.000029
2023-07-26 14:38:55,563 epoch 3 - iter 240/243 - loss 0.65094446 - time (sec): 444.82 - samples/sec: 174.77 - lr: 0.000030
2023-07-26 14:39:00,596 ----------------------------------------------------------------------------------------------------
2023-07-26 14:39:00,596 EPOCH 3 done: loss 0.6499 - lr 0.000030
2023-07-26 14:39:02,334 Evaluating as a multi-label problem: False
2023-07-26 14:39:02,376 DEV : loss 0.247285857796669 - f1-score (micro avg) 0.9518
2023-07-26 14:39:02,385 saving best model
2023-07-26 14:39:05,730 ----------------------------------------------------------------------------------------------------
2023-07-26 14:39:50,093 epoch 4 - iter 24/243 - loss 0.55472967 - time (sec): 44.36 - samples/sec: 176.66 - lr: 0.000031
2023-07-26 14:40:34,562 epoch 4 - iter 48/243 - loss 0.52360637 - time (sec): 88.83 - samples/sec: 175.54 - lr: 0.000032
2023-07-26 14:41:19,307 epoch 4 - iter 72/243 - loss 0.51655667 - time (sec): 133.58 - samples/sec: 174.54 - lr: 0.000033
2023-07-26 14:42:05,307 epoch 4 - iter 96/243 - loss 0.51891961 - time (sec): 179.58 - samples/sec: 173.86 - lr: 0.000034
2023-07-26 14:42:54,480 epoch 4 - iter 120/243 - loss 0.50631556 - time (sec): 228.75 - samples/sec: 171.40 - lr: 0.000035
2023-07-26 14:43:43,446 epoch 4 - iter 144/243 - loss 0.50459545 - time (sec): 277.72 - samples/sec: 168.74 - lr: 0.000036
2023-07-26 14:44:32,519 epoch 4 - iter 168/243 - loss 0.50045519 - time (sec): 326.79 - samples/sec: 167.35 - lr: 0.000037
2023-07-26 14:45:21,599 epoch 4 - iter 192/243 - loss 0.49446570 - time (sec): 375.87 - samples/sec: 166.24 - lr: 0.000038
2023-07-26 14:46:10,542 epoch 4 - iter 216/243 - loss 0.49218271 - time (sec): 424.81 - samples/sec: 165.38 - lr: 0.000039
2023-07-26 14:46:59,284 epoch 4 - iter 240/243 - loss 0.49159525 - time (sec): 473.55 - samples/sec: 164.09 - lr: 0.000040
2023-07-26 14:47:04,893 ----------------------------------------------------------------------------------------------------
2023-07-26 14:47:04,893 EPOCH 4 done: loss 0.4917 - lr 0.000040
2023-07-26 14:47:06,684 Evaluating as a multi-label problem: False
2023-07-26 14:47:06,726 DEV : loss 0.18006576597690582 - f1-score (micro avg) 0.9648
2023-07-26 14:47:06,736 saving best model
2023-07-26 14:47:10,014 ----------------------------------------------------------------------------------------------------
2023-07-26 14:47:54,932 epoch 5 - iter 24/243 - loss 0.45058356 - time (sec): 44.92 - samples/sec: 173.25 - lr: 0.000041
2023-07-26 14:48:41,950 epoch 5 - iter 48/243 - loss 0.43329992 - time (sec): 91.94 - samples/sec: 169.29 - lr: 0.000042
2023-07-26 14:49:33,377 epoch 5 - iter 72/243 - loss 0.43373609 - time (sec): 143.36 - samples/sec: 163.90 - lr: 0.000043
2023-07-26 14:50:24,178 epoch 5 - iter 96/243 - loss 0.43090189 - time (sec): 194.16 - samples/sec: 160.68 - lr: 0.000044
2023-07-26 14:51:14,713 epoch 5 - iter 120/243 - loss 0.42730629 - time (sec): 244.70 - samples/sec: 158.36 - lr: 0.000045
2023-07-26 14:52:05,519 epoch 5 - iter 144/243 - loss 0.42510607 - time (sec): 295.50 - samples/sec: 157.71 - lr: 0.000046
2023-07-26 14:52:56,269 epoch 5 - iter 168/243 - loss 0.42354677 - time (sec): 346.25 - samples/sec: 157.30 - lr: 0.000047
2023-07-26 14:53:45,024 epoch 5 - iter 192/243 - loss 0.42562343 - time (sec): 395.01 - samples/sec: 157.75 - lr: 0.000048
2023-07-26 14:54:29,614 epoch 5 - iter 216/243 - loss 0.42329549 - time (sec): 439.60 - samples/sec: 159.49 - lr: 0.000049
2023-07-26 14:55:14,101 epoch 5 - iter 240/243 - loss 0.42313631 - time (sec): 484.09 - samples/sec: 160.63 - lr: 0.000050
2023-07-26 14:55:19,182 ----------------------------------------------------------------------------------------------------
2023-07-26 14:55:19,183 EPOCH 5 done: loss 0.4224 - lr 0.000050
2023-07-26 14:55:20,964 Evaluating as a multi-label problem: False
2023-07-26 14:55:21,010 DEV : loss 0.15854212641716003 - f1-score (micro avg) 0.9715
2023-07-26 14:55:21,021 saving best model
2023-07-26 14:55:24,373 ----------------------------------------------------------------------------------------------------
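Note how the lr column has climbed linearly to the configured 0.000050 by the end of epoch 5 (10% of the 50 epochs) and decreases from epoch 6 onward. This is the shape of a linear warm-up plus linear decay schedule; the sketch below reproduces it under the assumption of a 0.1 warm-up fraction read off the log, not taken from the training script.

def linear_warmup_decay(step, total_steps, peak_lr=5e-5, warmup_frac=0.1):
    # Rise linearly to peak_lr over the warm-up steps, then decay linearly to 0.
    warmup_steps = int(total_steps * warmup_frac)
    if step < warmup_steps:
        return peak_lr * step / max(1, warmup_steps)
    return peak_lr * (total_steps - step) / max(1, total_steps - warmup_steps)

total = 243 * 50                              # 243 batches/epoch for 50 epochs
print(linear_warmup_decay(243 * 5, total))    # ~5e-05 at the end of epoch 5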
2023-07-26 14:56:09,000 epoch 6 - iter 24/243 - loss 0.38322411 - time (sec): 44.63 - samples/sec: 170.24 - lr: 0.000050
2023-07-26 14:56:53,917 epoch 6 - iter 48/243 - loss 0.38879490 - time (sec): 89.54 - samples/sec: 173.84 - lr: 0.000050
2023-07-26 14:57:38,715 epoch 6 - iter 72/243 - loss 0.39501775 - time (sec): 134.34 - samples/sec: 173.59 - lr: 0.000050
2023-07-26 14:58:23,414 epoch 6 - iter 96/243 - loss 0.39125526 - time (sec): 179.04 - samples/sec: 172.72 - lr: 0.000050
2023-07-26 14:59:08,294 epoch 6 - iter 120/243 - loss 0.38810381 - time (sec): 223.92 - samples/sec: 173.39 - lr: 0.000049
2023-07-26 14:59:53,048 epoch 6 - iter 144/243 - loss 0.38859919 - time (sec): 268.67 - samples/sec: 173.20 - lr: 0.000049
2023-07-26 15:00:37,709 epoch 6 - iter 168/243 - loss 0.39183603 - time (sec): 313.34 - samples/sec: 172.54 - lr: 0.000049
2023-07-26 15:01:22,620 epoch 6 - iter 192/243 - loss 0.39172498 - time (sec): 358.25 - samples/sec: 173.10 - lr: 0.000049
2023-07-26 15:02:07,658 epoch 6 - iter 216/243 - loss 0.38755663 - time (sec): 403.28 - samples/sec: 173.50 - lr: 0.000049
2023-07-26 15:02:52,481 epoch 6 - iter 240/243 - loss 0.38859503 - time (sec): 448.11 - samples/sec: 173.42 - lr: 0.000049
2023-07-26 15:02:57,605 ----------------------------------------------------------------------------------------------------
2023-07-26 15:02:57,605 EPOCH 6 done: loss 0.3889 - lr 0.000049
2023-07-26 15:02:59,359 Evaluating as a multi-label problem: False
2023-07-26 15:02:59,401 DEV : loss 0.1478930115699768 - f1-score (micro avg) 0.9729
2023-07-26 15:02:59,411 saving best model
2023-07-26 15:03:02,642 ----------------------------------------------------------------------------------------------------
2023-07-26 15:03:47,204 epoch 7 - iter 24/243 - loss 0.37119833 - time (sec): 44.56 - samples/sec: 170.57 - lr: 0.000049
2023-07-26 15:04:32,257 epoch 7 - iter 48/243 - loss 0.34925497 - time (sec): 89.61 - samples/sec: 170.90 - lr: 0.000049
2023-07-26 15:05:17,152 epoch 7 - iter 72/243 - loss 0.36339135 - time (sec): 134.51 - samples/sec: 170.74 - lr: 0.000049
2023-07-26 15:06:02,168 epoch 7 - iter 96/243 - loss 0.36053250 - time (sec): 179.53 - samples/sec: 172.30 - lr: 0.000048
2023-07-26 15:06:47,283 epoch 7 - iter 120/243 - loss 0.36487615 - time (sec): 224.64 - samples/sec: 173.25 - lr: 0.000048
2023-07-26 15:07:32,276 epoch 7 - iter 144/243 - loss 0.36319947 - time (sec): 269.63 - samples/sec: 173.36 - lr: 0.000048
2023-07-26 15:08:17,184 epoch 7 - iter 168/243 - loss 0.36321272 - time (sec): 314.54 - samples/sec: 173.50 - lr: 0.000048
2023-07-26 15:09:02,085 epoch 7 - iter 192/243 - loss 0.36447693 - time (sec): 359.44 - samples/sec: 173.23 - lr: 0.000048
2023-07-26 15:09:51,228 epoch 7 - iter 216/243 - loss 0.36744951 - time (sec): 408.59 - samples/sec: 171.35 - lr: 0.000048
2023-07-26 15:10:40,287 epoch 7 - iter 240/243 - loss 0.36634157 - time (sec): 457.64 - samples/sec: 169.91 - lr: 0.000048
2023-07-26 15:10:45,862 ----------------------------------------------------------------------------------------------------
2023-07-26 15:10:45,863 EPOCH 7 done: loss 0.3670 - lr 0.000048
2023-07-26 15:10:47,681 Evaluating as a multi-label problem: False
2023-07-26 15:10:47,726 DEV : loss 0.14240729808807373 - f1-score (micro avg) 0.9717
2023-07-26 15:10:47,736 ----------------------------------------------------------------------------------------------------
2023-07-26 15:11:32,421 epoch 8 - iter 24/243 - loss 0.35991738 - time (sec): 44.68 - samples/sec: 171.16 - lr: 0.000048
2023-07-26 15:12:16,856 epoch 8 - iter 48/243 - loss 0.34897131 - time (sec): 89.12 - samples/sec: 171.01 - lr: 0.000048
2023-07-26 15:13:01,243 epoch 8 - iter 72/243 - loss 0.34258107 - time (sec): 133.51 - samples/sec: 171.82 - lr: 0.000047
2023-07-26 15:13:45,557 epoch 8 - iter 96/243 - loss 0.34457191 - time (sec): 177.82 - samples/sec: 171.15 - lr: 0.000047
2023-07-26 15:14:33,081 epoch 8 - iter 120/243 - loss 0.34507195 - time (sec): 225.34 - samples/sec: 168.78 - lr: 0.000047
2023-07-26 15:15:23,807 epoch 8 - iter 144/243 - loss 0.34828898 - time (sec): 276.07 - samples/sec: 167.52 - lr: 0.000047
2023-07-26 15:16:16,673 epoch 8 - iter 168/243 - loss 0.34938445 - time (sec): 328.94 - samples/sec: 163.83 - lr: 0.000047
2023-07-26 15:17:08,647 epoch 8 - iter 192/243 - loss 0.34862273 - time (sec): 380.91 - samples/sec: 162.58 - lr: 0.000047
2023-07-26 15:17:59,292 epoch 8 - iter 216/243 - loss 0.34977990 - time (sec): 431.56 - samples/sec: 161.50 - lr: 0.000047
2023-07-26 15:18:48,823 epoch 8 - iter 240/243 - loss 0.34875804 - time (sec): 481.09 - samples/sec: 161.18 - lr: 0.000047
2023-07-26 15:18:54,694 ----------------------------------------------------------------------------------------------------
2023-07-26 15:18:54,694 EPOCH 8 done: loss 0.3496 - lr 0.000047
2023-07-26 15:18:56,484 Evaluating as a multi-label problem: False
2023-07-26 15:18:56,526 DEV : loss 0.13401205837726593 - f1-score (micro avg) 0.9752
2023-07-26 15:18:56,536 saving best model
2023-07-26 15:18:59,887 ----------------------------------------------------------------------------------------------------
2023-07-26 15:19:45,875 epoch 9 - iter 24/243 - loss 0.33211277 - time (sec): 45.99 - samples/sec: 171.57 - lr: 0.000047
2023-07-26 15:20:33,843 epoch 9 - iter 48/243 - loss 0.33508629 - time (sec): 93.96 - samples/sec: 171.82 - lr: 0.000046
2023-07-26 15:21:26,038 epoch 9 - iter 72/243 - loss 0.32662985 - time (sec): 146.15 - samples/sec: 162.61 - lr: 0.000046
2023-07-26 15:22:17,368 epoch 9 - iter 96/243 - loss 0.32958645 - time (sec): 197.48 - samples/sec: 159.51 - lr: 0.000046
2023-07-26 15:23:08,277 epoch 9 - iter 120/243 - loss 0.32364185 - time (sec): 248.39 - samples/sec: 157.62 - lr: 0.000046
2023-07-26 15:23:59,015 epoch 9 - iter 144/243 - loss 0.32701429 - time (sec): 299.13 - samples/sec: 156.28 - lr: 0.000046
2023-07-26 15:24:49,851 epoch 9 - iter 168/243 - loss 0.33017416 - time (sec): 349.96 - samples/sec: 155.73 - lr: 0.000046
2023-07-26 15:25:40,830 epoch 9 - iter 192/243 - loss 0.33104299 - time (sec): 400.94 - samples/sec: 156.11 - lr: 0.000046
2023-07-26 15:26:30,943 epoch 9 - iter 216/243 - loss 0.33454509 - time (sec): 451.06 - samples/sec: 155.81 - lr: 0.000046
2023-07-26 15:27:20,164 epoch 9 - iter 240/243 - loss 0.33386278 - time (sec): 500.28 - samples/sec: 155.37 - lr: 0.000046
2023-07-26 15:27:25,781 ----------------------------------------------------------------------------------------------------
2023-07-26 15:27:25,782 EPOCH 9 done: loss 0.3329 - lr 0.000046
2023-07-26 15:27:27,595 Evaluating as a multi-label problem: False
2023-07-26 15:27:27,637 DEV : loss 0.14190562069416046 - f1-score (micro avg) 0.9764
2023-07-26 15:27:27,647 saving best model
2023-07-26 15:27:31,002 ----------------------------------------------------------------------------------------------------
2023-07-26 15:28:16,088 epoch 10 - iter 24/243 - loss 0.34002265 - time (sec): 45.09 - samples/sec: 170.28 - lr: 0.000045
2023-07-26 15:29:00,810 epoch 10 - iter 48/243 - loss 0.33540108 - time (sec): 89.81 - samples/sec: 172.64 - lr: 0.000045
2023-07-26 15:29:45,833 epoch 10 - iter 72/243 - loss 0.33399184 - time (sec): 134.83 - samples/sec: 173.50 - lr: 0.000045
2023-07-26 15:30:30,533 epoch 10 - iter 96/243 - loss 0.32469492 - time (sec): 179.53 - samples/sec: 173.83 - lr: 0.000045
2023-07-26 15:31:15,030 epoch 10 - iter 120/243 - loss 0.32910415 - time (sec): 224.03 - samples/sec: 173.44 - lr: 0.000045
2023-07-26 15:31:59,646 epoch 10 - iter 144/243 - loss 0.32899582 - time (sec): 268.64 - samples/sec: 173.64 - lr: 0.000045
2023-07-26 15:32:44,609 epoch 10 - iter 168/243 - loss 0.33093813 - time (sec): 313.61 - samples/sec: 174.48 - lr: 0.000045
2023-07-26 15:33:29,306 epoch 10 - iter 192/243 - loss 0.33208597 - time (sec): 358.30 - samples/sec: 173.78 - lr: 0.000045
2023-07-26 15:34:14,223 epoch 10 - iter 216/243 - loss 0.33175324 - time (sec): 403.22 - samples/sec: 174.07 - lr: 0.000045
2023-07-26 15:34:58,900 epoch 10 - iter 240/243 - loss 0.33262740 - time (sec): 447.90 - samples/sec: 173.56 - lr: 0.000044
2023-07-26 15:35:04,010 ----------------------------------------------------------------------------------------------------
2023-07-26 15:35:04,010 EPOCH 10 done: loss 0.3321 - lr 0.000044
2023-07-26 15:35:06,264 Evaluating as a multi-label problem: False
2023-07-26 15:35:06,306 DEV : loss 0.1481310874223709 - f1-score (micro avg) 0.9734
2023-07-26 15:35:06,316 ----------------------------------------------------------------------------------------------------
2023-07-26 15:35:51,091 epoch 11 - iter 24/243 - loss 0.33230355 - time (sec): 44.77 - samples/sec: 172.33 - lr: 0.000044
2023-07-26 15:36:36,125 epoch 11 - iter 48/243 - loss 0.32441123 - time (sec): 89.81 - samples/sec: 170.71 - lr: 0.000044
2023-07-26 15:37:25,279 epoch 11 - iter 72/243 - loss 0.32514673 - time (sec): 138.96 - samples/sec: 167.78 - lr: 0.000044
2023-07-26 15:38:10,516 epoch 11 - iter 96/243 - loss 0.32235685 - time (sec): 184.20 - samples/sec: 169.57 - lr: 0.000044
2023-07-26 15:38:58,115 epoch 11 - iter 120/243 - loss 0.31705674 - time (sec): 231.80 - samples/sec: 167.98 - lr: 0.000044
2023-07-26 15:39:45,447 epoch 11 - iter 144/243 - loss 0.31351156 - time (sec): 279.13 - samples/sec: 166.74 - lr: 0.000044
2023-07-26 15:40:32,843 epoch 11 - iter 168/243 - loss 0.31453443 - time (sec): 326.53 - samples/sec: 166.47 - lr: 0.000044
2023-07-26 15:41:20,505 epoch 11 - iter 192/243 - loss 0.32048855 - time (sec): 374.19 - samples/sec: 166.74 - lr: 0.000044
2023-07-26 15:42:08,594 epoch 11 - iter 216/243 - loss 0.31914298 - time (sec): 422.28 - samples/sec: 166.07 - lr: 0.000043
2023-07-26 15:42:58,015 epoch 11 - iter 240/243 - loss 0.31938530 - time (sec): 471.70 - samples/sec: 164.83 - lr: 0.000043
2023-07-26 15:43:03,640 ----------------------------------------------------------------------------------------------------
2023-07-26 15:43:03,640 EPOCH 11 done: loss 0.3201 - lr 0.000043
2023-07-26 15:43:05,491 Evaluating as a multi-label problem: False
2023-07-26 15:43:05,538 DEV : loss 0.16022486984729767 - f1-score (micro avg) 0.9744
2023-07-26 15:43:05,549 ----------------------------------------------------------------------------------------------------
2023-07-26 15:43:51,010 epoch 12 - iter 24/243 - loss 0.30634651 - time (sec): 45.46 - samples/sec: 169.22 - lr: 0.000043
2023-07-26 15:44:35,828 epoch 12 - iter 48/243 - loss 0.32055500 - time (sec): 90.28 - samples/sec: 169.40 - lr: 0.000043
2023-07-26 15:45:20,616 epoch 12 - iter 72/243 - loss 0.31591461 - time (sec): 135.07 - samples/sec: 170.20 - lr: 0.000043
2023-07-26 15:46:05,323 epoch 12 - iter 96/243 - loss 0.31720616 - time (sec): 179.77 - samples/sec: 171.25 - lr: 0.000043
2023-07-26 15:46:50,172 epoch 12 - iter 120/243 - loss 0.31877634 - time (sec): 224.62 - samples/sec: 172.25 - lr: 0.000043
2023-07-26 15:47:34,948 epoch 12 - iter 144/243 - loss 0.31817728 - time (sec): 269.40 - samples/sec: 172.60 - lr: 0.000043
2023-07-26 15:48:19,648 epoch 12 - iter 168/243 - loss 0.31409341 - time (sec): 314.10 - samples/sec: 173.20 - lr: 0.000043
2023-07-26 15:49:04,450 epoch 12 - iter 192/243 - loss 0.31475214 - time (sec): 358.90 - samples/sec: 172.72 - lr: 0.000042
2023-07-26 15:49:49,156 epoch 12 - iter 216/243 - loss 0.31439205 - time (sec): 403.61 - samples/sec: 173.13 - lr: 0.000042
2023-07-26 15:50:33,925 epoch 12 - iter 240/243 - loss 0.31462372 - time (sec): 448.38 - samples/sec: 173.38 - lr: 0.000042
2023-07-26 15:50:39,009 ----------------------------------------------------------------------------------------------------
2023-07-26 15:50:39,009 EPOCH 12 done: loss 0.3146 - lr 0.000042
2023-07-26 15:50:40,760 Evaluating as a multi-label problem: False
2023-07-26 15:50:40,803 DEV : loss 0.17038877308368683 - f1-score (micro avg) 0.9764
2023-07-26 15:50:40,813 ----------------------------------------------------------------------------------------------------
2023-07-26 15:51:25,228 epoch 13 - iter 24/243 - loss 0.30871471 - time (sec): 44.42 - samples/sec: 169.20 - lr: 0.000042
2023-07-26 15:52:09,735 epoch 13 - iter 48/243 - loss 0.30951571 - time (sec): 88.92 - samples/sec: 169.92 - lr: 0.000042
2023-07-26 15:52:54,713 epoch 13 - iter 72/243 - loss 0.30146253 - time (sec): 133.90 - samples/sec: 170.69 - lr: 0.000042
2023-07-26 15:53:39,688 epoch 13 - iter 96/243 - loss 0.29818491 - time (sec): 178.88 - samples/sec: 171.59 - lr: 0.000042
2023-07-26 15:54:24,347 epoch 13 - iter 120/243 - loss 0.29829818 - time (sec): 223.53 - samples/sec: 171.45 - lr: 0.000042
2023-07-26 15:55:09,312 epoch 13 - iter 144/243 - loss 0.31111593 - time (sec): 268.50 - samples/sec: 171.76 - lr: 0.000042
2023-07-26 15:55:54,240 epoch 13 - iter 168/243 - loss 0.31147702 - time (sec): 313.43 - samples/sec: 171.94 - lr: 0.000041
2023-07-26 15:56:39,090 epoch 13 - iter 192/243 - loss 0.30976085 - time (sec): 358.28 - samples/sec: 172.90 - lr: 0.000041
2023-07-26 15:57:24,278 epoch 13 - iter 216/243 - loss 0.30904370 - time (sec): 403.46 - samples/sec: 173.00 - lr: 0.000041
2023-07-26 15:58:09,133 epoch 13 - iter 240/243 - loss 0.30572837 - time (sec): 448.32 - samples/sec: 173.40 - lr: 0.000041
2023-07-26 15:58:14,202 ----------------------------------------------------------------------------------------------------
2023-07-26 15:58:14,202 EPOCH 13 done: loss 0.3056 - lr 0.000041
2023-07-26 15:58:15,991 Evaluating as a multi-label problem: False
2023-07-26 15:58:16,034 DEV : loss 0.16180633008480072 - f1-score (micro avg) 0.9766
2023-07-26 15:58:16,044 saving best model
2023-07-26 15:58:19,355 ----------------------------------------------------------------------------------------------------
2023-07-26 15:59:06,668 epoch 14 - iter 24/243 - loss 0.28577045 - time (sec): 47.31 - samples/sec: 164.50 - lr: 0.000041
2023-07-26 15:59:56,998 epoch 14 - iter 48/243 - loss 0.28369661 - time (sec): 97.64 - samples/sec: 158.27 - lr: 0.000041
2023-07-26 16:00:51,211 epoch 14 - iter 72/243 - loss 0.29071442 - time (sec): 151.86 - samples/sec: 153.53 - lr: 0.000041
2023-07-26 16:01:43,557 epoch 14 - iter 96/243 - loss 0.29219267 - time (sec): 204.20 - samples/sec: 154.01 - lr: 0.000041
2023-07-26 16:02:32,810 epoch 14 - iter 120/243 - loss 0.29452027 - time (sec): 253.45 - samples/sec: 154.42 - lr: 0.000041
2023-07-26 16:03:22,073 epoch 14 - iter 144/243 - loss 0.28860385 - time (sec): 302.72 - samples/sec: 154.60 - lr: 0.000040
2023-07-26 16:04:11,432 epoch 14 - iter 168/243 - loss 0.29040567 - time (sec): 352.08 - samples/sec: 155.10 - lr: 0.000040
2023-07-26 16:05:00,439 epoch 14 - iter 192/243 - loss 0.29057669 - time (sec): 401.08 - samples/sec: 155.56 - lr: 0.000040
2023-07-26 16:05:49,734 epoch 14 - iter 216/243 - loss 0.29351512 - time (sec): 450.38 - samples/sec: 155.77 - lr: 0.000040
2023-07-26 16:06:38,920 epoch 14 - iter 240/243 - loss 0.29475470 - time (sec): 499.56 - samples/sec: 155.77 - lr: 0.000040
2023-07-26 16:06:44,452 ----------------------------------------------------------------------------------------------------
2023-07-26 16:06:44,452 EPOCH 14 done: loss 0.2946 - lr 0.000040
2023-07-26 16:06:46,282 Evaluating as a multi-label problem: False
2023-07-26 16:06:46,328 DEV : loss 0.1961415857076645 - f1-score (micro avg) 0.9729
2023-07-26 16:06:46,338 ----------------------------------------------------------------------------------------------------
2023-07-26 16:07:31,298 epoch 15 - iter 24/243 - loss 0.32628632 - time (sec): 44.96 - samples/sec: 171.98 - lr: 0.000040
2023-07-26 16:08:21,094 epoch 15 - iter 48/243 - loss 0.30408958 - time (sec): 94.76 - samples/sec: 164.10 - lr: 0.000040
2023-07-26 16:09:15,364 epoch 15 - iter 72/243 - loss 0.29750206 - time (sec): 149.03 - samples/sec: 157.51 - lr: 0.000040
2023-07-26 16:10:06,024 epoch 15 - iter 96/243 - loss 0.29760832 - time (sec): 199.69 - samples/sec: 155.97 - lr: 0.000040
2023-07-26 16:10:56,205 epoch 15 - iter 120/243 - loss 0.29974418 - time (sec): 249.87 - samples/sec: 155.76 - lr: 0.000039
2023-07-26 16:11:43,301 epoch 15 - iter 144/243 - loss 0.29904887 - time (sec): 296.96 - samples/sec: 157.05 - lr: 0.000039
2023-07-26 16:12:31,170 epoch 15 - iter 168/243 - loss 0.29894209 - time (sec): 344.83 - samples/sec: 157.73 - lr: 0.000039
2023-07-26 16:13:20,187 epoch 15 - iter 192/243 - loss 0.29754010 - time (sec): 393.85 - samples/sec: 157.85 - lr: 0.000039
2023-07-26 16:14:09,012 epoch 15 - iter 216/243 - loss 0.29884402 - time (sec): 442.67 - samples/sec: 157.79 - lr: 0.000039
2023-07-26 16:14:57,878 epoch 15 - iter 240/243 - loss 0.29706337 - time (sec): 491.54 - samples/sec: 158.08 - lr: 0.000039
2023-07-26 16:15:03,351 ----------------------------------------------------------------------------------------------------
2023-07-26 16:15:03,351 EPOCH 15 done: loss 0.2971 - lr 0.000039
2023-07-26 16:15:05,134 Evaluating as a multi-label problem: False
2023-07-26 16:15:05,176 DEV : loss 0.21415923535823822 - f1-score (micro avg) 0.9737
2023-07-26 16:15:05,186 ----------------------------------------------------------------------------------------------------
2023-07-26 16:15:50,049 epoch 16 - iter 24/243 - loss 0.32918671 - time (sec): 44.86 - samples/sec: 172.79 - lr: 0.000039
2023-07-26 16:16:34,768 epoch 16 - iter 48/243 - loss 0.30668793 - time (sec): 89.58 - samples/sec: 172.52 - lr: 0.000039
2023-07-26 16:17:19,891 epoch 16 - iter 72/243 - loss 0.30165600 - time (sec): 134.70 - samples/sec: 171.72 - lr: 0.000039
2023-07-26 16:18:09,624 epoch 16 - iter 96/243 - loss 0.29977956 - time (sec): 184.44 - samples/sec: 168.02 - lr: 0.000038
2023-07-26 16:18:58,935 epoch 16 - iter 120/243 - loss 0.29035278 - time (sec): 233.75 - samples/sec: 165.52 - lr: 0.000038
2023-07-26 16:19:48,358 epoch 16 - iter 144/243 - loss 0.28688344 - time (sec): 283.17 - samples/sec: 164.52 - lr: 0.000038
2023-07-26 16:20:37,728 epoch 16 - iter 168/243 - loss 0.28573744 - time (sec): 332.54 - samples/sec: 163.65 - lr: 0.000038
2023-07-26 16:21:26,994 epoch 16 - iter 192/243 - loss 0.28483557 - time (sec): 381.81 - samples/sec: 162.65 - lr: 0.000038
2023-07-26 16:22:16,480 epoch 16 - iter 216/243 - loss 0.28487700 - time (sec): 431.29 - samples/sec: 162.23 - lr: 0.000038
2023-07-26 16:23:05,837 epoch 16 - iter 240/243 - loss 0.28570848 - time (sec): 480.65 - samples/sec: 161.78 - lr: 0.000038
2023-07-26 16:23:11,437 ----------------------------------------------------------------------------------------------------
2023-07-26 16:23:11,437 EPOCH 16 done: loss 0.2858 - lr 0.000038
2023-07-26 16:23:13,234 Evaluating as a multi-label problem: False
2023-07-26 16:23:13,276 DEV : loss 0.17488490045070648 - f1-score (micro avg) 0.9764
2023-07-26 16:23:13,286 ----------------------------------------------------------------------------------------------------
2023-07-26 16:23:58,069 epoch 17 - iter 24/243 - loss 0.28223418 - time (sec): 44.78 - samples/sec: 169.35 - lr: 0.000038
2023-07-26 16:24:42,914 epoch 17 - iter 48/243 - loss 0.28773045 - time (sec): 89.63 - samples/sec: 170.29 - lr: 0.000038
2023-07-26 16:25:28,001 epoch 17 - iter 72/243 - loss 0.28949629 - time (sec): 134.72 - samples/sec: 171.86 - lr: 0.000037
2023-07-26 16:26:12,604 epoch 17 - iter 96/243 - loss 0.29081122 - time (sec): 179.32 - samples/sec: 172.97 - lr: 0.000037
2023-07-26 16:26:57,287 epoch 17 - iter 120/243 - loss 0.28910214 - time (sec): 224.00 - samples/sec: 173.28 - lr: 0.000037
2023-07-26 16:27:41,994 epoch 17 - iter 144/243 - loss 0.28813940 - time (sec): 268.71 - samples/sec: 173.98 - lr: 0.000037
2023-07-26 16:28:26,701 epoch 17 - iter 168/243 - loss 0.28649377 - time (sec): 313.42 - samples/sec: 174.08 - lr: 0.000037
2023-07-26 16:29:11,540 epoch 17 - iter 192/243 - loss 0.28690817 - time (sec): 358.25 - samples/sec: 174.44 - lr: 0.000037
2023-07-26 16:29:56,114 epoch 17 - iter 216/243 - loss 0.28529445 - time (sec): 402.83 - samples/sec: 173.78 - lr: 0.000037
2023-07-26 16:30:40,993 epoch 17 - iter 240/243 - loss 0.28495055 - time (sec): 447.71 - samples/sec: 173.50 - lr: 0.000037
2023-07-26 16:30:46,121 ----------------------------------------------------------------------------------------------------
2023-07-26 16:30:46,122 EPOCH 17 done: loss 0.2845 - lr 0.000037
2023-07-26 16:30:47,874 Evaluating as a multi-label problem: False
2023-07-26 16:30:47,918 DEV : loss 0.1961992233991623 - f1-score (micro avg) 0.9764
2023-07-26 16:30:47,928 ----------------------------------------------------------------------------------------------------
2023-07-26 16:31:33,170 epoch 18 - iter 24/243 - loss 0.28778804 - time (sec): 45.24 - samples/sec: 183.77 - lr: 0.000037
2023-07-26 16:32:17,561 epoch 18 - iter 48/243 - loss 0.28633144 - time (sec): 89.63 - samples/sec: 178.16 - lr: 0.000036
2023-07-26 16:33:02,262 epoch 18 - iter 72/243 - loss 0.28829018 - time (sec): 134.33 - samples/sec: 176.29 - lr: 0.000036
2023-07-26 16:33:47,023 epoch 18 - iter 96/243 - loss 0.28737825 - time (sec): 179.10 - samples/sec: 176.55 - lr: 0.000036
2023-07-26 16:34:31,632 epoch 18 - iter 120/243 - loss 0.28870528 - time (sec): 223.70 - samples/sec: 176.96 - lr: 0.000036
2023-07-26 16:35:16,249 epoch 18 - iter 144/243 - loss 0.28536506 - time (sec): 268.32 - samples/sec: 176.48 - lr: 0.000036
2023-07-26 16:36:01,090 epoch 18 - iter 168/243 - loss 0.28612314 - time (sec): 313.16 - samples/sec: 175.92 - lr: 0.000036
2023-07-26 16:36:46,062 epoch 18 - iter 192/243 - loss 0.28681958 - time (sec): 358.13 - samples/sec: 174.98 - lr: 0.000036
2023-07-26 16:37:31,063 epoch 18 - iter 216/243 - loss 0.28815101 - time (sec): 403.14 - samples/sec: 174.53 - lr: 0.000036
2023-07-26 16:38:19,082 epoch 18 - iter 240/243 - loss 0.28697818 - time (sec): 451.15 - samples/sec: 172.46 - lr: 0.000036
2023-07-26 16:38:24,758 ----------------------------------------------------------------------------------------------------
2023-07-26 16:38:24,759 EPOCH 18 done: loss 0.2865 - lr 0.000036
2023-07-26 16:38:27,073 Evaluating as a multi-label problem: False
2023-07-26 16:38:27,115 DEV : loss 0.18113288283348083 - f1-score (micro avg) 0.9781
2023-07-26 16:38:27,126 saving best model
2023-07-26 16:38:30,288 ----------------------------------------------------------------------------------------------------
2023-07-26 16:39:21,782 epoch 19 - iter 24/243 - loss 0.28138164 - time (sec): 51.49 - samples/sec: 154.23 - lr: 0.000036
2023-07-26 16:40:12,650 epoch 19 - iter 48/243 - loss 0.28992986 - time (sec): 102.36 - samples/sec: 150.35 - lr: 0.000035
2023-07-26 16:41:02,164 epoch 19 - iter 72/243 - loss 0.28244605 - time (sec): 151.88 - samples/sec: 152.82 - lr: 0.000035
2023-07-26 16:41:52,390 epoch 19 - iter 96/243 - loss 0.28642854 - time (sec): 202.10 - samples/sec: 152.66 - lr: 0.000035
2023-07-26 16:42:44,635 epoch 19 - iter 120/243 - loss 0.28768114 - time (sec): 254.35 - samples/sec: 151.96 - lr: 0.000035
2023-07-26 16:43:33,907 epoch 19 - iter 144/243 - loss 0.28722806 - time (sec): 303.62 - samples/sec: 153.22 - lr: 0.000035
2023-07-26 16:44:23,000 epoch 19 - iter 168/243 - loss 0.28477685 - time (sec): 352.71 - samples/sec: 154.35 - lr: 0.000035
2023-07-26 16:45:11,847 epoch 19 - iter 192/243 - loss 0.28564618 - time (sec): 401.56 - samples/sec: 155.01 - lr: 0.000035
2023-07-26 16:46:00,662 epoch 19 - iter 216/243 - loss 0.28166734 - time (sec): 450.37 - samples/sec: 155.14 - lr: 0.000035
2023-07-26 16:46:49,519 epoch 19 - iter 240/243 - loss 0.28044622 - time (sec): 499.23 - samples/sec: 155.64 - lr: 0.000035
2023-07-26 16:46:55,052 ----------------------------------------------------------------------------------------------------
2023-07-26 16:46:55,052 EPOCH 19 done: loss 0.2808 - lr 0.000035
2023-07-26 16:46:56,840 Evaluating as a multi-label problem: False
2023-07-26 16:46:56,881 DEV : loss 0.2043328434228897 - f1-score (micro avg) 0.9793
2023-07-26 16:46:56,891 saving best model
2023-07-26 16:47:00,311 ----------------------------------------------------------------------------------------------------
2023-07-26 16:47:53,948 epoch 20 - iter 24/243 - loss 0.28666954 - time (sec): 53.64 - samples/sec: 145.20 - lr: 0.000034
2023-07-26 16:48:47,747 epoch 20 - iter 48/243 - loss 0.29481761 - time (sec): 107.44 - samples/sec: 143.54 - lr: 0.000034
2023-07-26 16:49:41,712 epoch 20 - iter 72/243 - loss 0.29914317 - time (sec): 161.40 - samples/sec: 143.67 - lr: 0.000034
2023-07-26 16:50:34,164 epoch 20 - iter 96/243 - loss 0.29393948 - time (sec): 213.85 - samples/sec: 144.15 - lr: 0.000034
2023-07-26 16:51:26,758 epoch 20 - iter 120/243 - loss 0.29259273 - time (sec): 266.45 - samples/sec: 144.69 - lr: 0.000034
2023-07-26 16:52:19,496 epoch 20 - iter 144/243 - loss 0.29189521 - time (sec): 319.18 - samples/sec: 145.56 - lr: 0.000034
2023-07-26 16:53:12,248 epoch 20 - iter 168/243 - loss 0.29174956 - time (sec): 371.94 - samples/sec: 146.27 - lr: 0.000034
2023-07-26 16:54:04,770 epoch 20 - iter 192/243 - loss 0.28991116 - time (sec): 424.46 - samples/sec: 146.24 - lr: 0.000034
2023-07-26 16:54:57,220 epoch 20 - iter 216/243 - loss 0.28908421 - time (sec): 476.91 - samples/sec: 146.03 - lr: 0.000034
2023-07-26 16:55:50,110 epoch 20 - iter 240/243 - loss 0.28802142 - time (sec): 529.80 - samples/sec: 146.82 - lr: 0.000033
2023-07-26 16:55:56,063 ----------------------------------------------------------------------------------------------------
2023-07-26 16:55:56,064 EPOCH 20 done: loss 0.2884 - lr 0.000033
2023-07-26 16:55:58,153 Evaluating as a multi-label problem: False
2023-07-26 16:55:58,197 DEV : loss 0.17976026237010956 - f1-score (micro avg) 0.9798
2023-07-26 16:55:58,210 saving best model
2023-07-26 16:56:01,163 ----------------------------------------------------------------------------------------------------
2023-07-26 16:56:45,917 epoch 21 - iter 24/243 - loss 0.27074814 - time (sec): 44.75 - samples/sec: 174.64 - lr: 0.000033
2023-07-26 16:57:30,503 epoch 21 - iter 48/243 - loss 0.27757152 - time (sec): 89.34 - samples/sec: 172.96 - lr: 0.000033
2023-07-26 16:58:15,097 epoch 21 - iter 72/243 - loss 0.27454337 - time (sec): 133.93 - samples/sec: 173.08 - lr: 0.000033
2023-07-26 16:58:59,717 epoch 21 - iter 96/243 - loss 0.27609707 - time (sec): 178.55 - samples/sec: 172.80 - lr: 0.000033
2023-07-26 16:59:44,372 epoch 21 - iter 120/243 - loss 0.27224083 - time (sec): 223.21 - samples/sec: 172.96 - lr: 0.000033
2023-07-26 17:00:29,083 epoch 21 - iter 144/243 - loss 0.27850149 - time (sec): 267.92 - samples/sec: 172.72 - lr: 0.000033
2023-07-26 17:01:13,636 epoch 21 - iter 168/243 - loss 0.27696398 - time (sec): 312.47 - samples/sec: 172.79 - lr: 0.000033
2023-07-26 17:01:58,291 epoch 21 - iter 192/243 - loss 0.27664755 - time (sec): 357.13 - samples/sec: 172.80 - lr: 0.000033
2023-07-26 17:02:43,178 epoch 21 - iter 216/243 - loss 0.27558848 - time (sec): 402.01 - samples/sec: 173.76 - lr: 0.000032
2023-07-26 17:03:27,865 epoch 21 - iter 240/243 - loss 0.27583214 - time (sec): 446.70 - samples/sec: 173.99 - lr: 0.000032
2023-07-26 17:03:32,964 ----------------------------------------------------------------------------------------------------
2023-07-26 17:03:32,964 EPOCH 21 done: loss 0.2761 - lr 0.000032
2023-07-26 17:03:34,719 Evaluating as a multi-label problem: False
2023-07-26 17:03:34,761 DEV : loss 0.20532046258449554 - f1-score (micro avg) 0.9808
2023-07-26 17:03:34,770 saving best model
2023-07-26 17:03:38,172 ----------------------------------------------------------------------------------------------------
2023-07-26 17:04:22,817 epoch 22 - iter 24/243 - loss 0.27909847 - time (sec): 44.64 - samples/sec: 173.08 - lr: 0.000032
2023-07-26 17:05:07,696 epoch 22 - iter 48/243 - loss 0.27692541 - time (sec): 89.52 - samples/sec: 175.94 - lr: 0.000032
2023-07-26 17:05:52,516 epoch 22 - iter 72/243 - loss 0.27632545 - time (sec): 134.34 - samples/sec: 175.36 - lr: 0.000032
2023-07-26 17:06:37,349 epoch 22 - iter 96/243 - loss 0.27607549 - time (sec): 179.18 - samples/sec: 175.31 - lr: 0.000032
2023-07-26 17:07:22,028 epoch 22 - iter 120/243 - loss 0.27687957 - time (sec): 223.85 - samples/sec: 175.34 - lr: 0.000032
2023-07-26 17:08:06,628 epoch 22 - iter 144/243 - loss 0.27294774 - time (sec): 268.46 - samples/sec: 174.93 - lr: 0.000032
2023-07-26 17:08:51,184 epoch 22 - iter 168/243 - loss 0.27391471 - time (sec): 313.01 - samples/sec: 174.15 - lr: 0.000032
2023-07-26 17:09:35,805 epoch 22 - iter 192/243 - loss 0.27352263 - time (sec): 357.63 - samples/sec: 174.01 - lr: 0.000031
2023-07-26 17:10:20,566 epoch 22 - iter 216/243 - loss 0.27144978 - time (sec): 402.39 - samples/sec: 174.04 - lr: 0.000031
2023-07-26 17:11:05,178 epoch 22 - iter 240/243 - loss 0.27338785 - time (sec): 447.01 - samples/sec: 173.85 - lr: 0.000031
2023-07-26 17:11:10,275 ----------------------------------------------------------------------------------------------------
2023-07-26 17:11:10,275 EPOCH 22 done: loss 0.2738 - lr 0.000031
2023-07-26 17:11:12,042 Evaluating as a multi-label problem: False
2023-07-26 17:11:12,084 DEV : loss 0.20975473523139954 - f1-score (micro avg) 0.9771
2023-07-26 17:11:12,094 ----------------------------------------------------------------------------------------------------
2023-07-26 17:11:57,033 epoch 23 - iter 24/243 - loss 0.28534317 - time (sec): 44.94 - samples/sec: 175.57 - lr: 0.000031
2023-07-26 17:12:41,709 epoch 23 - iter 48/243 - loss 0.28084455 - time (sec): 89.61 - samples/sec: 173.42 - lr: 0.000031
2023-07-26 17:13:26,426 epoch 23 - iter 72/243 - loss 0.28011749 - time (sec): 134.33 - samples/sec: 173.68 - lr: 0.000031
2023-07-26 17:14:10,996 epoch 23 - iter 96/243 - loss 0.28443955 - time (sec): 178.90 - samples/sec: 173.25 - lr: 0.000031
2023-07-26 17:14:55,898 epoch 23 - iter 120/243 - loss 0.28290269 - time (sec): 223.80 - samples/sec: 173.90 - lr: 0.000031
2023-07-26 17:15:40,508 epoch 23 - iter 144/243 - loss 0.28079246 - time (sec): 268.41 - samples/sec: 173.44 - lr: 0.000031
2023-07-26 17:16:25,384 epoch 23 - iter 168/243 - loss 0.27982769 - time (sec): 313.29 - samples/sec: 173.93 - lr: 0.000030
2023-07-26 17:17:10,020 epoch 23 - iter 192/243 - loss 0.27685678 - time (sec): 357.93 - samples/sec: 173.50 - lr: 0.000030
2023-07-26 17:17:54,847 epoch 23 - iter 216/243 - loss 0.27359946 - time (sec): 402.75 - samples/sec: 173.94 - lr: 0.000030
2023-07-26 17:18:39,474 epoch 23 - iter 240/243 - loss 0.27378796 - time (sec): 447.38 - samples/sec: 173.62 - lr: 0.000030
2023-07-26 17:18:44,594 ----------------------------------------------------------------------------------------------------
2023-07-26 17:18:44,594 EPOCH 23 done: loss 0.2739 - lr 0.000030
2023-07-26 17:18:46,344 Evaluating as a multi-label problem: False
2023-07-26 17:18:46,386 DEV : loss 0.21456189453601837 - f1-score (micro avg) 0.9796
2023-07-26 17:18:46,395 ----------------------------------------------------------------------------------------------------
2023-07-26 17:19:31,051 epoch 24 - iter 24/243 - loss 0.28123621 - time (sec): 44.66 - samples/sec: 168.56 - lr: 0.000030
2023-07-26 17:20:15,553 epoch 24 - iter 48/243 - loss 0.27128197 - time (sec): 89.16 - samples/sec: 168.93 - lr: 0.000030
2023-07-26 17:21:00,218 epoch 24 - iter 72/243 - loss 0.26742573 - time (sec): 133.82 - samples/sec: 169.68 - lr: 0.000030
2023-07-26 17:21:44,804 epoch 24 - iter 96/243 - loss 0.27426501 - time (sec): 178.41 - samples/sec: 170.21 - lr: 0.000030
2023-07-26 17:22:29,693 epoch 24 - iter 120/243 - loss 0.26958800 - time (sec): 223.30 - samples/sec: 171.86 - lr: 0.000030
2023-07-26 17:23:14,736 epoch 24 - iter 144/243 - loss 0.27011544 - time (sec): 268.34 - samples/sec: 174.09 - lr: 0.000029
2023-07-26 17:23:59,891 epoch 24 - iter 168/243 - loss 0.26573691 - time (sec): 313.50 - samples/sec: 173.54 - lr: 0.000029
2023-07-26 17:24:44,440 epoch 24 - iter 192/243 - loss 0.26424698 - time (sec): 358.04 - samples/sec: 173.71 - lr: 0.000029
2023-07-26 17:25:28,792 epoch 24 - iter 216/243 - loss 0.26555746 - time (sec): 402.40 - samples/sec: 173.43 - lr: 0.000029
2023-07-26 17:26:13,338 epoch 24 - iter 240/243 - loss 0.26918457 - time (sec): 446.94 - samples/sec: 173.77 - lr: 0.000029
2023-07-26 17:26:18,446 ----------------------------------------------------------------------------------------------------
2023-07-26 17:26:18,447 EPOCH 24 done: loss 0.2696 - lr 0.000029
2023-07-26 17:26:20,206 Evaluating as a multi-label problem: False
2023-07-26 17:26:20,252 DEV : loss 0.21408958733081818 - f1-score (micro avg) 0.9788
2023-07-26 17:26:20,263 ----------------------------------------------------------------------------------------------------
2023-07-26 17:27:04,792 epoch 25 - iter 24/243 - loss 0.26057600 - time (sec): 44.53 - samples/sec: 175.66 - lr: 0.000029
2023-07-26 17:27:49,230 epoch 25 - iter 48/243 - loss 0.25988897 - time (sec): 88.97 - samples/sec: 175.54 - lr: 0.000029
2023-07-26 17:28:34,576 epoch 25 - iter 72/243 - loss 0.26336622 - time (sec): 134.31 - samples/sec: 174.81 - lr: 0.000029
2023-07-26 17:29:19,911 epoch 25 - iter 96/243 - loss 0.26126366 - time (sec): 179.65 - samples/sec: 174.70 - lr: 0.000029
2023-07-26 17:30:05,863 epoch 25 - iter 120/243 - loss 0.26114761 - time (sec): 225.60 - samples/sec: 173.32 - lr: 0.000028
2023-07-26 17:30:54,836 epoch 25 - iter 144/243 - loss 0.26019042 - time (sec): 274.57 - samples/sec: 170.53 - lr: 0.000028
2023-07-26 17:31:44,973 epoch 25 - iter 168/243 - loss 0.26060643 - time (sec): 324.71 - samples/sec: 168.00 - lr: 0.000028
2023-07-26 17:32:34,267 epoch 25 - iter 192/243 - loss 0.26158525 - time (sec): 374.00 - samples/sec: 167.12 - lr: 0.000028
2023-07-26 17:33:23,148 epoch 25 - iter 216/243 - loss 0.25965178 - time (sec): 422.89 - samples/sec: 165.83 - lr: 0.000028
2023-07-26 17:34:11,558 epoch 25 - iter 240/243 - loss 0.25991617 - time (sec): 471.29 - samples/sec: 165.08 - lr: 0.000028
2023-07-26 17:34:17,049 ----------------------------------------------------------------------------------------------------
2023-07-26 17:34:17,049 EPOCH 25 done: loss 0.2605 - lr 0.000028
2023-07-26 17:34:18,858 Evaluating as a multi-label problem: False
2023-07-26 17:34:18,901 DEV : loss 0.20778048038482666 - f1-score (micro avg) 0.9801
2023-07-26 17:34:18,911 ----------------------------------------------------------------------------------------------------
2023-07-26 17:35:06,124 epoch 26 - iter 24/243 - loss 0.25028245 - time (sec): 47.21 - samples/sec: 162.56 - lr: 0.000028
2023-07-26 17:35:56,450 epoch 26 - iter 48/243 - loss 0.26759368 - time (sec): 97.54 - samples/sec: 159.39 - lr: 0.000028
2023-07-26 17:36:48,254 epoch 26 - iter 72/243 - loss 0.26240750 - time (sec): 149.34 - samples/sec: 155.33 - lr: 0.000028
2023-07-26 17:37:37,860 epoch 26 - iter 96/243 - loss 0.26499737 - time (sec): 198.95 - samples/sec: 155.95 - lr: 0.000027
2023-07-26 17:38:26,594 epoch 26 - iter 120/243 - loss 0.26765442 - time (sec): 247.68 - samples/sec: 155.61 - lr: 0.000027
2023-07-26 17:39:15,951 epoch 26 - iter 144/243 - loss 0.26496660 - time (sec): 297.04 - samples/sec: 155.98 - lr: 0.000027
2023-07-26 17:40:04,512 epoch 26 - iter 168/243 - loss 0.26407033 - time (sec): 345.60 - samples/sec: 157.09 - lr: 0.000027
2023-07-26 17:40:52,402 epoch 26 - iter 192/243 - loss 0.26463487 - time (sec): 393.49 - samples/sec: 157.76 - lr: 0.000027
2023-07-26 17:41:40,356 epoch 26 - iter 216/243 - loss 0.26192074 - time (sec): 441.45 - samples/sec: 158.66 - lr: 0.000027
2023-07-26 17:42:28,247 epoch 26 - iter 240/243 - loss 0.26299030 - time (sec): 489.34 - samples/sec: 158.86 - lr: 0.000027
2023-07-26 17:42:33,832 ----------------------------------------------------------------------------------------------------
2023-07-26 17:42:33,833 EPOCH 26 done: loss 0.2631 - lr 0.000027
2023-07-26 17:42:35,630 Evaluating as a multi-label problem: False
2023-07-26 17:42:35,672 DEV : loss 0.22401468455791473 - f1-score (micro avg) 0.9786
2023-07-26 17:42:35,682 ----------------------------------------------------------------------------------------------------
2023-07-26 17:43:20,454 epoch 27 - iter 24/243 - loss 0.26639657 - time (sec): 44.77 - samples/sec: 182.81 - lr: 0.000027
2023-07-26 17:44:04,934 epoch 27 - iter 48/243 - loss 0.27451501 - time (sec): 89.25 - samples/sec: 178.70 - lr: 0.000027
2023-07-26 17:44:49,425 epoch 27 - iter 72/243 - loss 0.27289399 - time (sec): 133.74 - samples/sec: 176.62 - lr: 0.000026
2023-07-26 17:45:33,681 epoch 27 - iter 96/243 - loss 0.27091536 - time (sec): 178.00 - samples/sec: 175.50 - lr: 0.000026
2023-07-26 17:46:18,171 epoch 27 - iter 120/243 - loss 0.27191898 - time (sec): 222.49 - samples/sec: 173.82 - lr: 0.000026
2023-07-26 17:47:02,640 epoch 27 - iter 144/243 - loss 0.27013358 - time (sec): 266.96 - samples/sec: 173.92 - lr: 0.000026
2023-07-26 17:47:47,032 epoch 27 - iter 168/243 - loss 0.26766038 - time (sec): 311.35 - samples/sec: 173.58 - lr: 0.000026
2023-07-26 17:48:33,707 epoch 27 - iter 192/243 - loss 0.26602770 - time (sec): 358.02 - samples/sec: 173.06 - lr: 0.000026
2023-07-26 17:49:21,690 epoch 27 - iter 216/243 - loss 0.26757355 - time (sec): 406.01 - samples/sec: 171.85 - lr: 0.000026
2023-07-26 17:50:09,653 epoch 27 - iter 240/243 - loss 0.26544815 - time (sec): 453.97 - samples/sec: 171.19 - lr: 0.000026
2023-07-26 17:50:15,122 ----------------------------------------------------------------------------------------------------
2023-07-26 17:50:15,123 EPOCH 27 done: loss 0.2656 - lr 0.000026
2023-07-26 17:50:17,372 Evaluating as a multi-label problem: False
2023-07-26 17:50:17,414 DEV : loss 0.2324327975511551 - f1-score (micro avg) 0.9771
2023-07-26 17:50:17,424 ----------------------------------------------------------------------------------------------------
2023-07-26 17:51:02,154 epoch 28 - iter 24/243 - loss 0.26044359 - time (sec): 44.73 - samples/sec: 177.24 - lr: 0.000026
2023-07-26 17:51:46,725 epoch 28 - iter 48/243 - loss 0.25192260 - time (sec): 89.30 - samples/sec: 175.55 - lr: 0.000025
2023-07-26 17:52:31,357 epoch 28 - iter 72/243 - loss 0.24867911 - time (sec): 133.93 - samples/sec: 175.88 - lr: 0.000025
2023-07-26 17:53:15,933 epoch 28 - iter 96/243 - loss 0.25204485 - time (sec): 178.51 - samples/sec: 175.73 - lr: 0.000025
2023-07-26 17:54:00,443 epoch 28 - iter 120/243 - loss 0.24981817 - time (sec): 223.02 - samples/sec: 174.90 - lr: 0.000025
2023-07-26 17:54:44,958 epoch 28 - iter 144/243 - loss 0.25157168 - time (sec): 267.53 - samples/sec: 174.46 - lr: 0.000025
2023-07-26 17:55:29,493 epoch 28 - iter 168/243 - loss 0.25440998 - time (sec): 312.07 - samples/sec: 174.04 - lr: 0.000025
2023-07-26 17:56:13,998 epoch 28 - iter 192/243 - loss 0.25791455 - time (sec): 356.57 - samples/sec: 174.06 - lr: 0.000025
2023-07-26 17:56:58,663 epoch 28 - iter 216/243 - loss 0.26113615 - time (sec): 401.24 - samples/sec: 173.82 - lr: 0.000025
2023-07-26 17:57:43,598 epoch 28 - iter 240/243 - loss 0.26254906 - time (sec): 446.17 - samples/sec: 174.40 - lr: 0.000025
2023-07-26 17:57:48,629 ----------------------------------------------------------------------------------------------------
2023-07-26 17:57:48,629 EPOCH 28 done: loss 0.2628 - lr 0.000025
2023-07-26 17:57:50,384 Evaluating as a multi-label problem: False
2023-07-26 17:57:50,427 DEV : loss 0.21640333533287048 - f1-score (micro avg) 0.9803
2023-07-26 17:57:50,437 ----------------------------------------------------------------------------------------------------
2023-07-26 17:58:34,969 epoch 29 - iter 24/243 - loss 0.24833162 - time (sec): 44.53 - samples/sec: 173.47 - lr: 0.000024
2023-07-26 17:59:19,469 epoch 29 - iter 48/243 - loss 0.25554505 - time (sec): 89.03 - samples/sec: 173.26 - lr: 0.000024
2023-07-26 18:00:04,033 epoch 29 - iter 72/243 - loss 0.26313723 - time (sec): 133.60 - samples/sec: 173.10 - lr: 0.000024
2023-07-26 18:00:48,651 epoch 29 - iter 96/243 - loss 0.26456129 - time (sec): 178.21 - samples/sec: 173.90 - lr: 0.000024
2023-07-26 18:01:33,121 epoch 29 - iter 120/243 - loss 0.26539430 - time (sec): 222.68 - samples/sec: 173.48 - lr: 0.000024
2023-07-26 18:02:17,661 epoch 29 - iter 144/243 - loss 0.26756174 - time (sec): 267.22 - samples/sec: 173.79 - lr: 0.000024
2023-07-26 18:03:02,505 epoch 29 - iter 168/243 - loss 0.26309703 - time (sec): 312.07 - samples/sec: 174.46 - lr: 0.000024
2023-07-26 18:03:46,972 epoch 29 - iter 192/243 - loss 0.26532971 - time (sec): 356.53 - samples/sec: 173.68 - lr: 0.000024
2023-07-26 18:04:31,621 epoch 29 - iter 216/243 - loss 0.26648227 - time (sec): 401.18 - samples/sec: 173.71 - lr: 0.000024
2023-07-26 18:05:16,534 epoch 29 - iter 240/243 - loss 0.26528743 - time (sec): 446.10 - samples/sec: 174.44 - lr: 0.000023
2023-07-26 18:05:21,587 ----------------------------------------------------------------------------------------------------
2023-07-26 18:05:21,587 EPOCH 29 done: loss 0.2655 - lr 0.000023
2023-07-26 18:05:23,600 Evaluating as a multi-label problem: False
2023-07-26 18:05:23,646 DEV : loss 0.24248327314853668 - f1-score (micro avg) 0.9796
2023-07-26 18:05:23,660 ----------------------------------------------------------------------------------------------------
2023-07-26 18:06:12,165 epoch 30 - iter 24/243 - loss 0.26154968 - time (sec): 48.51 - samples/sec: 161.26 - lr: 0.000023
2023-07-26 18:07:02,942 epoch 30 - iter 48/243 - loss 0.27126768 - time (sec): 99.28 - samples/sec: 157.70 - lr: 0.000023
2023-07-26 18:07:57,979 epoch 30 - iter 72/243 - loss 0.27468039 - time (sec): 154.32 - samples/sec: 150.27 - lr: 0.000023
2023-07-26 18:08:53,322 epoch 30 - iter 96/243 - loss 0.27662270 - time (sec): 209.66 - samples/sec: 147.69 - lr: 0.000023
2023-07-26 18:09:48,639 epoch 30 - iter 120/243 - loss 0.27403633 - time (sec): 264.98 - samples/sec: 145.88 - lr: 0.000023
2023-07-26 18:10:40,050 epoch 30 - iter 144/243 - loss 0.27461637 - time (sec): 316.39 - samples/sec: 146.57 - lr: 0.000023
2023-07-26 18:11:29,552 epoch 30 - iter 168/243 - loss 0.26994770 - time (sec): 365.89 - samples/sec: 148.67 - lr: 0.000023
2023-07-26 18:12:18,746 epoch 30 - iter 192/243 - loss 0.26952319 - time (sec): 415.09 - samples/sec: 150.29 - lr: 0.000023
2023-07-26 18:13:07,757 epoch 30 - iter 216/243 - loss 0.26556592 - time (sec): 464.10 - samples/sec: 151.23 - lr: 0.000022
2023-07-26 18:13:56,449 epoch 30 - iter 240/243 - loss 0.26521277 - time (sec): 512.79 - samples/sec: 151.74 - lr: 0.000022
2023-07-26 18:14:01,871 ----------------------------------------------------------------------------------------------------
2023-07-26 18:14:01,871 EPOCH 30 done: loss 0.2653 - lr 0.000022
2023-07-26 18:14:03,693 Evaluating as a multi-label problem: False
2023-07-26 18:14:03,735 DEV : loss 0.23393450677394867 - f1-score (micro avg) 0.9776
2023-07-26 18:14:03,746 ----------------------------------------------------------------------------------------------------
2023-07-26 18:14:48,764 epoch 31 - iter 24/243 - loss 0.24073944 - time (sec): 45.02 - samples/sec: 179.77 - lr: 0.000022
2023-07-26 18:15:33,209 epoch 31 - iter 48/243 - loss 0.24507990 - time (sec): 89.46 - samples/sec: 173.00 - lr: 0.000022
2023-07-26 18:16:17,809 epoch 31 - iter 72/243 - loss 0.25127541 - time (sec): 134.06 - samples/sec: 173.96 - lr: 0.000022
2023-07-26 18:17:02,650 epoch 31 - iter 96/243 - loss 0.25526836 - time (sec): 178.90 - samples/sec: 175.19 - lr: 0.000022
2023-07-26 18:17:47,365 epoch 31 - iter 120/243 - loss 0.25884615 - time (sec): 223.62 - samples/sec: 174.90 - lr: 0.000022
2023-07-26 18:18:32,093 epoch 31 - iter 144/243 - loss 0.26107421 - time (sec): 268.35 - samples/sec: 174.71 - lr: 0.000022
2023-07-26 18:19:16,568 epoch 31 - iter 168/243 - loss 0.25772191 - time (sec): 312.82 - samples/sec: 174.07 - lr: 0.000022
2023-07-26 18:20:01,232 epoch 31 - iter 192/243 - loss 0.25843953 - time (sec): 357.49 - samples/sec: 174.12 - lr: 0.000021
2023-07-26 18:20:46,098 epoch 31 - iter 216/243 - loss 0.25940033 - time (sec): 402.35 - samples/sec: 174.28 - lr: 0.000021
2023-07-26 18:21:30,680 epoch 31 - iter 240/243 - loss 0.25924131 - time (sec): 446.93 - samples/sec: 173.95 - lr: 0.000021
2023-07-26 18:21:35,753 ----------------------------------------------------------------------------------------------------
2023-07-26 18:21:35,753 EPOCH 31 done: loss 0.2594 - lr 0.000021
2023-07-26 18:21:37,502 Evaluating as a multi-label problem: False
2023-07-26 18:21:37,544 DEV : loss 0.22774212062358856 - f1-score (micro avg) 0.9788
2023-07-26 18:21:37,554 ----------------------------------------------------------------------------------------------------
2023-07-26 18:22:22,282 epoch 32 - iter 24/243 - loss 0.25476998 - time (sec): 44.73 - samples/sec: 179.17 - lr: 0.000021
2023-07-26 18:23:07,025 epoch 32 - iter 48/243 - loss 0.25629909 - time (sec): 89.47 - samples/sec: 178.31 - lr: 0.000021
2023-07-26 18:23:51,761 epoch 32 - iter 72/243 - loss 0.25739595 - time (sec): 134.21 - samples/sec: 177.13 - lr: 0.000021
2023-07-26 18:24:36,312 epoch 32 - iter 96/243 - loss 0.26207122 - time (sec): 178.76 - samples/sec: 175.24 - lr: 0.000021
2023-07-26 18:25:20,955 epoch 32 - iter 120/243 - loss 0.26238445 - time (sec): 223.40 - samples/sec: 175.45 - lr: 0.000021
2023-07-26 18:26:05,680 epoch 32 - iter 144/243 - loss 0.26421827 - time (sec): 268.13 - samples/sec: 174.45 - lr: 0.000021
2023-07-26 18:26:50,600 epoch 32 - iter 168/243 - loss 0.26554256 - time (sec): 313.05 - samples/sec: 175.05 - lr: 0.000020
2023-07-26 18:27:37,550 epoch 32 - iter 192/243 - loss 0.26682748 - time (sec): 360.00 - samples/sec: 173.67 - lr: 0.000020
2023-07-26 18:28:26,938 epoch 32 - iter 216/243 - loss 0.26495455 - time (sec): 409.38 - samples/sec: 172.06 - lr: 0.000020
2023-07-26 18:29:15,763 epoch 32 - iter 240/243 - loss 0.26526827 - time (sec): 458.21 - samples/sec: 169.70 - lr: 0.000020
2023-07-26 18:29:21,316 ----------------------------------------------------------------------------------------------------
2023-07-26 18:29:21,316 EPOCH 32 done: loss 0.2646 - lr 0.000020
2023-07-26 18:29:23,143 Evaluating as a multi-label problem: False
2023-07-26 18:29:23,187 DEV : loss 0.22920973598957062 - f1-score (micro avg) 0.9793
2023-07-26 18:29:23,197 ----------------------------------------------------------------------------------------------------
2023-07-26 18:30:10,600 epoch 33 - iter 24/243 - loss 0.26866868 - time (sec): 47.40 - samples/sec: 165.35 - lr: 0.000020
2023-07-26 18:30:58,341 epoch 33 - iter 48/243 - loss 0.25914800 - time (sec): 95.14 - samples/sec: 161.12 - lr: 0.000020
2023-07-26 18:31:46,238 epoch 33 - iter 72/243 - loss 0.25631313 - time (sec): 143.04 - samples/sec: 161.25 - lr: 0.000020
2023-07-26 18:32:38,739 epoch 33 - iter 96/243 - loss 0.25455371 - time (sec): 195.54 - samples/sec: 158.90 - lr: 0.000020
2023-07-26 18:33:26,705 epoch 33 - iter 120/243 - loss 0.25585405 - time (sec): 243.51 - samples/sec: 159.48 - lr: 0.000020
2023-07-26 18:34:14,895 epoch 33 - iter 144/243 - loss 0.25945055 - time (sec): 291.70 - samples/sec: 159.74 - lr: 0.000019
2023-07-26 18:35:02,659 epoch 33 - iter 168/243 - loss 0.25932428 - time (sec): 339.46 - samples/sec: 159.76 - lr: 0.000019
2023-07-26 18:35:50,532 epoch 33 - iter 192/243 - loss 0.25851724 - time (sec): 387.33 - samples/sec: 160.31 - lr: 0.000019
2023-07-26 18:36:38,327 epoch 33 - iter 216/243 - loss 0.25678080 - time (sec): 435.13 - samples/sec: 160.50 - lr: 0.000019
2023-07-26 18:37:26,262 epoch 33 - iter 240/243 - loss 0.25562158 - time (sec): 483.06 - samples/sec: 160.86 - lr: 0.000019
2023-07-26 18:37:31,732 ----------------------------------------------------------------------------------------------------
2023-07-26 18:37:31,732 EPOCH 33 done: loss 0.2552 - lr 0.000019
2023-07-26 18:37:33,524 Evaluating as a multi-label problem: False
2023-07-26 18:37:33,566 DEV : loss 0.23627179861068726 - f1-score (micro avg) 0.9791
2023-07-26 18:37:33,576 ----------------------------------------------------------------------------------------------------
2023-07-26 18:38:18,104 epoch 34 - iter 24/243 - loss 0.27182899 - time (sec): 44.53 - samples/sec: 177.66 - lr: 0.000019
2023-07-26 18:39:02,789 epoch 34 - iter 48/243 - loss 0.27027922 - time (sec): 89.21 - samples/sec: 177.42 - lr: 0.000019
2023-07-26 18:39:47,401 epoch 34 - iter 72/243 - loss 0.26451951 - time (sec): 133.83 - samples/sec: 176.71 - lr: 0.000019
2023-07-26 18:40:31,661 epoch 34 - iter 96/243 - loss 0.26736759 - time (sec): 178.09 - samples/sec: 174.31 - lr: 0.000019
2023-07-26 18:41:16,196 epoch 34 - iter 120/243 - loss 0.26439071 - time (sec): 222.62 - samples/sec: 174.62 - lr: 0.000018
2023-07-26 18:42:00,770 epoch 34 - iter 144/243 - loss 0.26033732 - time (sec): 267.19 - samples/sec: 174.48 - lr: 0.000018
2023-07-26 18:42:45,441 epoch 34 - iter 168/243 - loss 0.25756053 - time (sec): 311.87 - samples/sec: 174.19 - lr: 0.000018
2023-07-26 18:43:30,194 epoch 34 - iter 192/243 - loss 0.26053780 - time (sec): 356.62 - samples/sec: 174.51 - lr: 0.000018
2023-07-26 18:44:14,725 epoch 34 - iter 216/243 - loss 0.26079037 - time (sec): 401.15 - samples/sec: 174.64 - lr: 0.000018
2023-07-26 18:44:59,292 epoch 34 - iter 240/243 - loss 0.25971768 - time (sec): 445.72 - samples/sec: 174.39 - lr: 0.000018
2023-07-26 18:45:04,380 ----------------------------------------------------------------------------------------------------
2023-07-26 18:45:04,380 EPOCH 34 done: loss 0.2595 - lr 0.000018
2023-07-26 18:45:06,131 Evaluating as a multi-label problem: False
2023-07-26 18:45:06,173 DEV : loss 0.23955273628234863 - f1-score (micro avg) 0.9796
2023-07-26 18:45:06,183 ----------------------------------------------------------------------------------------------------
2023-07-26 18:45:50,882 epoch 35 - iter 24/243 - loss 0.26701266 - time (sec): 44.70 - samples/sec: 178.66 - lr: 0.000018
2023-07-26 18:46:35,519 epoch 35 - iter 48/243 - loss 0.25211759 - time (sec): 89.34 - samples/sec: 175.67 - lr: 0.000018
2023-07-26 18:47:20,251 epoch 35 - iter 72/243 - loss 0.25876122 - time (sec): 134.07 - samples/sec: 175.92 - lr: 0.000018
2023-07-26 18:48:04,922 epoch 35 - iter 96/243 - loss 0.25751966 - time (sec): 178.74 - samples/sec: 175.77 - lr: 0.000017
2023-07-26 18:48:49,416 epoch 35 - iter 120/243 - loss 0.25782676 - time (sec): 223.23 - samples/sec: 174.59 - lr: 0.000017
2023-07-26 18:49:34,049 epoch 35 - iter 144/243 - loss 0.26020302 - time (sec): 267.87 - samples/sec: 174.72 - lr: 0.000017
2023-07-26 18:50:18,677 epoch 35 - iter 168/243 - loss 0.26431905 - time (sec): 312.49 - samples/sec: 175.29 - lr: 0.000017
2023-07-26 18:51:03,300 epoch 35 - iter 192/243 - loss 0.26060801 - time (sec): 357.12 - samples/sec: 175.10 - lr: 0.000017
2023-07-26 18:51:47,857 epoch 35 - iter 216/243 - loss 0.26100924 - time (sec): 401.67 - samples/sec: 174.60 - lr: 0.000017
2023-07-26 18:52:32,385 epoch 35 - iter 240/243 - loss 0.26071736 - time (sec): 446.20 - samples/sec: 174.23 - lr: 0.000017
2023-07-26 18:52:37,453 ----------------------------------------------------------------------------------------------------
2023-07-26 18:52:37,454 EPOCH 35 done: loss 0.2611 - lr 0.000017
2023-07-26 18:52:39,658 Evaluating as a multi-label problem: False
2023-07-26 18:52:39,699 DEV : loss 0.24450713396072388 - f1-score (micro avg) 0.9791
2023-07-26 18:52:39,709 ----------------------------------------------------------------------------------------------------
2023-07-26 18:53:24,264 epoch 36 - iter 24/243 - loss 0.27084705 - time (sec): 44.55 - samples/sec: 175.18 - lr: 0.000017
2023-07-26 18:54:08,663 epoch 36 - iter 48/243 - loss 0.25947400 - time (sec): 88.95 - samples/sec: 173.11 - lr: 0.000017
2023-07-26 18:54:53,501 epoch 36 - iter 72/243 - loss 0.25687195 - time (sec): 133.79 - samples/sec: 175.35 - lr: 0.000016
2023-07-26 18:55:37,893 epoch 36 - iter 96/243 - loss 0.25424198 - time (sec): 178.18 - samples/sec: 173.93 - lr: 0.000016
2023-07-26 18:56:22,286 epoch 36 - iter 120/243 - loss 0.25557169 - time (sec): 222.58 - samples/sec: 173.34 - lr: 0.000016
2023-07-26 18:57:15,000 epoch 36 - iter 144/243 - loss 0.25787383 - time (sec): 275.29 - samples/sec: 168.90 - lr: 0.000016
2023-07-26 18:58:08,183 epoch 36 - iter 168/243 - loss 0.25642415 - time (sec): 328.47 - samples/sec: 165.37 - lr: 0.000016
2023-07-26 18:59:02,250 epoch 36 - iter 192/243 - loss 0.25543523 - time (sec): 382.54 - samples/sec: 162.77 - lr: 0.000016
2023-07-26 18:59:53,084 epoch 36 - iter 216/243 - loss 0.25443060 - time (sec): 433.38 - samples/sec: 161.58 - lr: 0.000016
2023-07-26 19:00:41,508 epoch 36 - iter 240/243 - loss 0.25344304 - time (sec): 481.80 - samples/sec: 161.25 - lr: 0.000016
2023-07-26 19:00:47,029 ----------------------------------------------------------------------------------------------------
2023-07-26 19:00:47,029 EPOCH 36 done: loss 0.2536 - lr 0.000016
2023-07-26 19:00:48,817 Evaluating as a multi-label problem: False
2023-07-26 19:00:48,859 DEV : loss 0.2530966103076935 - f1-score (micro avg) 0.9788
2023-07-26 19:00:48,869 ----------------------------------------------------------------------------------------------------
2023-07-26 19:01:33,809 epoch 37 - iter 24/243 - loss 0.27190881 - time (sec): 44.94 - samples/sec: 183.74 - lr: 0.000016
2023-07-26 19:02:18,402 epoch 37 - iter 48/243 - loss 0.26681536 - time (sec): 89.53 - samples/sec: 178.86 - lr: 0.000015
2023-07-26 19:03:03,153 epoch 37 - iter 72/243 - loss 0.26204165 - time (sec): 134.28 - samples/sec: 177.43 - lr: 0.000015
2023-07-26 19:03:47,816 epoch 37 - iter 96/243 - loss 0.25844813 - time (sec): 178.95 - samples/sec: 176.20 - lr: 0.000015
2023-07-26 19:04:32,391 epoch 37 - iter 120/243 - loss 0.25889938 - time (sec): 223.52 - samples/sec: 174.82 - lr: 0.000015
2023-07-26 19:05:17,029 epoch 37 - iter 144/243 - loss 0.26222809 - time (sec): 268.16 - samples/sec: 175.18 - lr: 0.000015
2023-07-26 19:06:01,650 epoch 37 - iter 168/243 - loss 0.26407155 - time (sec): 312.78 - samples/sec: 174.91 - lr: 0.000015
2023-07-26 19:06:46,300 epoch 37 - iter 192/243 - loss 0.26361155 - time (sec): 357.43 - samples/sec: 174.83 - lr: 0.000015
2023-07-26 19:07:31,061 epoch 37 - iter 216/243 - loss 0.26668156 - time (sec): 402.19 - samples/sec: 174.78 - lr: 0.000015
2023-07-26 19:08:15,436 epoch 37 - iter 240/243 - loss 0.26504239 - time (sec): 446.57 - samples/sec: 174.15 - lr: 0.000015
2023-07-26 19:08:20,495 ----------------------------------------------------------------------------------------------------
2023-07-26 19:08:20,495 EPOCH 37 done: loss 0.2650 - lr 0.000015
2023-07-26 19:08:22,330 Evaluating as a multi-label problem: False
2023-07-26 19:08:22,374 DEV : loss 0.2624962031841278 - f1-score (micro avg) 0.9781
2023-07-26 19:08:22,384 ----------------------------------------------------------------------------------------------------
2023-07-26 19:09:06,629 epoch 38 - iter 24/243 - loss 0.26162759 - time (sec): 44.24 - samples/sec: 174.37 - lr: 0.000014
2023-07-26 19:09:51,176 epoch 38 - iter 48/243 - loss 0.26085357 - time (sec): 88.79 - samples/sec: 175.87 - lr: 0.000014
2023-07-26 19:10:35,702 epoch 38 - iter 72/243 - loss 0.25308808 - time (sec): 133.32 - samples/sec: 176.61 - lr: 0.000014
2023-07-26 19:11:19,948 epoch 38 - iter 96/243 - loss 0.25632516 - time (sec): 177.56 - samples/sec: 175.93 - lr: 0.000014
2023-07-26 19:12:04,580 epoch 38 - iter 120/243 - loss 0.25358337 - time (sec): 222.20 - samples/sec: 176.70 - lr: 0.000014
2023-07-26 19:12:48,992 epoch 38 - iter 144/243 - loss 0.25557088 - time (sec): 266.61 - samples/sec: 176.51 - lr: 0.000014
2023-07-26 19:13:33,435 epoch 38 - iter 168/243 - loss 0.25407854 - time (sec): 311.05 - samples/sec: 176.83 - lr: 0.000014
2023-07-26 19:14:17,541 epoch 38 - iter 192/243 - loss 0.25597339 - time (sec): 355.16 - samples/sec: 176.02 - lr: 0.000014
2023-07-26 19:15:01,826 epoch 38 - iter 216/243 - loss 0.25532730 - time (sec): 399.44 - samples/sec: 175.68 - lr: 0.000014
2023-07-26 19:15:45,905 epoch 38 - iter 240/243 - loss 0.25415245 - time (sec): 443.52 - samples/sec: 175.02 - lr: 0.000013
2023-07-26 19:15:51,052 ----------------------------------------------------------------------------------------------------
2023-07-26 19:15:51,053 EPOCH 38 done: loss 0.2542 - lr 0.000013
2023-07-26 19:15:52,801 Evaluating as a multi-label problem: False
2023-07-26 19:15:52,845 DEV : loss 0.24244999885559082 - f1-score (micro avg) 0.9788
2023-07-26 19:15:52,855 ----------------------------------------------------------------------------------------------------
2023-07-26 19:16:37,293 epoch 39 - iter 24/243 - loss 0.25336484 - time (sec): 44.44 - samples/sec: 176.50 - lr: 0.000013
2023-07-26 19:17:21,644 epoch 39 - iter 48/243 - loss 0.25897743 - time (sec): 88.79 - samples/sec: 177.38 - lr: 0.000013
2023-07-26 19:18:05,772 epoch 39 - iter 72/243 - loss 0.25769549 - time (sec): 132.92 - samples/sec: 175.31 - lr: 0.000013
2023-07-26 19:18:50,169 epoch 39 - iter 96/243 - loss 0.25751150 - time (sec): 177.31 - samples/sec: 175.93 - lr: 0.000013
2023-07-26 19:19:34,381 epoch 39 - iter 120/243 - loss 0.25315782 - time (sec): 221.53 - samples/sec: 175.24 - lr: 0.000013
2023-07-26 19:20:18,559 epoch 39 - iter 144/243 - loss 0.25233489 - time (sec): 265.70 - samples/sec: 174.74 - lr: 0.000013
2023-07-26 19:21:03,145 epoch 39 - iter 168/243 - loss 0.25114668 - time (sec): 310.29 - samples/sec: 174.33 - lr: 0.000013
2023-07-26 19:21:47,854 epoch 39 - iter 192/243 - loss 0.25185953 - time (sec): 355.00 - samples/sec: 174.12 - lr: 0.000013
2023-07-26 19:22:32,507 epoch 39 - iter 216/243 - loss 0.25746349 - time (sec): 399.65 - samples/sec: 174.90 - lr: 0.000012
2023-07-26 19:23:16,796 epoch 39 - iter 240/243 - loss 0.25680252 - time (sec): 443.94 - samples/sec: 174.98 - lr: 0.000012
2023-07-26 19:23:21,907 ----------------------------------------------------------------------------------------------------
2023-07-26 19:23:21,908 EPOCH 39 done: loss 0.2579 - lr 0.000012
2023-07-26 19:23:23,678 Evaluating as a multi-label problem: False
2023-07-26 19:23:23,719 DEV : loss 0.24615894258022308 - f1-score (micro avg) 0.9798
2023-07-26 19:23:23,729 ----------------------------------------------------------------------------------------------------
2023-07-26 19:24:08,073 epoch 40 - iter 24/243 - loss 0.24837758 - time (sec): 44.34 - samples/sec: 175.24 - lr: 0.000012
2023-07-26 19:24:52,448 epoch 40 - iter 48/243 - loss 0.24725040 - time (sec): 88.72 - samples/sec: 176.39 - lr: 0.000012
2023-07-26 19:25:37,011 epoch 40 - iter 72/243 - loss 0.25023824 - time (sec): 133.28 - samples/sec: 176.92 - lr: 0.000012
2023-07-26 19:26:21,296 epoch 40 - iter 96/243 - loss 0.24239002 - time (sec): 177.57 - samples/sec: 176.32 - lr: 0.000012
2023-07-26 19:27:05,481 epoch 40 - iter 120/243 - loss 0.24524267 - time (sec): 221.75 - samples/sec: 175.34 - lr: 0.000012
2023-07-26 19:27:49,791 epoch 40 - iter 144/243 - loss 0.24784591 - time (sec): 266.06 - samples/sec: 175.50 - lr: 0.000012
2023-07-26 19:28:34,155 epoch 40 - iter 168/243 - loss 0.24872740 - time (sec): 310.43 - samples/sec: 174.67 - lr: 0.000012
2023-07-26 19:29:18,697 epoch 40 - iter 192/243 - loss 0.25012412 - time (sec): 354.97 - samples/sec: 174.67 - lr: 0.000011
2023-07-26 19:30:03,191 epoch 40 - iter 216/243 - loss 0.25345259 - time (sec): 399.46 - samples/sec: 174.99 - lr: 0.000011
2023-07-26 19:30:47,560 epoch 40 - iter 240/243 - loss 0.25383699 - time (sec): 443.83 - samples/sec: 174.98 - lr: 0.000011
2023-07-26 19:30:52,654 ----------------------------------------------------------------------------------------------------
2023-07-26 19:30:52,655 EPOCH 40 done: loss 0.2540 - lr 0.000011
2023-07-26 19:30:54,396 Evaluating as a multi-label problem: False
2023-07-26 19:30:54,438 DEV : loss 0.2575598359107971 - f1-score (micro avg) 0.9791
2023-07-26 19:30:54,447 ----------------------------------------------------------------------------------------------------
2023-07-26 19:31:38,768 epoch 41 - iter 24/243 - loss 0.24306327 - time (sec): 44.32 - samples/sec: 175.47 - lr: 0.000011
2023-07-26 19:32:23,092 epoch 41 - iter 48/243 - loss 0.24156726 - time (sec): 88.64 - samples/sec: 175.50 - lr: 0.000011
2023-07-26 19:33:07,508 epoch 41 - iter 72/243 - loss 0.24869032 - time (sec): 133.06 - samples/sec: 177.14 - lr: 0.000011
2023-07-26 19:33:51,664 epoch 41 - iter 96/243 - loss 0.25072177 - time (sec): 177.22 - samples/sec: 175.38 - lr: 0.000011
2023-07-26 19:34:35,771 epoch 41 - iter 120/243 - loss 0.25396376 - time (sec): 221.32 - samples/sec: 174.35 - lr: 0.000011
2023-07-26 19:35:20,071 epoch 41 - iter 144/243 - loss 0.25095812 - time (sec): 265.62 - samples/sec: 174.83 - lr: 0.000011
2023-07-26 19:36:04,548 epoch 41 - iter 168/243 - loss 0.24810464 - time (sec): 310.10 - samples/sec: 175.56 - lr: 0.000010
2023-07-26 19:36:48,812 epoch 41 - iter 192/243 - loss 0.24879453 - time (sec): 354.36 - samples/sec: 175.37 - lr: 0.000010
2023-07-26 19:37:33,241 epoch 41 - iter 216/243 - loss 0.25177431 - time (sec): 398.79 - samples/sec: 175.79 - lr: 0.000010
2023-07-26 19:38:17,405 epoch 41 - iter 240/243 - loss 0.25152758 - time (sec): 442.96 - samples/sec: 175.50 - lr: 0.000010
2023-07-26 19:38:22,468 ----------------------------------------------------------------------------------------------------
2023-07-26 19:38:22,469 EPOCH 41 done: loss 0.2509 - lr 0.000010
2023-07-26 19:38:24,215 Evaluating as a multi-label problem: False
2023-07-26 19:38:24,257 DEV : loss 0.25127604603767395 - f1-score (micro avg) 0.9786
2023-07-26 19:38:24,267 ----------------------------------------------------------------------------------------------------
2023-07-26 19:39:08,271 epoch 42 - iter 24/243 - loss 0.25413425 - time (sec): 44.00 - samples/sec: 167.14 - lr: 0.000010
2023-07-26 19:39:52,711 epoch 42 - iter 48/243 - loss 0.25771203 - time (sec): 88.44 - samples/sec: 173.59 - lr: 0.000010
2023-07-26 19:40:37,013 epoch 42 - iter 72/243 - loss 0.25402986 - time (sec): 132.75 - samples/sec: 174.07 - lr: 0.000010
2023-07-26 19:41:21,464 epoch 42 - iter 96/243 - loss 0.25689370 - time (sec): 177.20 - samples/sec: 175.64 - lr: 0.000010
2023-07-26 19:42:05,507 epoch 42 - iter 120/243 - loss 0.25635789 - time (sec): 221.24 - samples/sec: 174.08 - lr: 0.000010
2023-07-26 19:42:49,881 epoch 42 - iter 144/243 - loss 0.25641142 - time (sec): 265.61 - samples/sec: 174.68 - lr: 0.000009
2023-07-26 19:43:34,200 epoch 42 - iter 168/243 - loss 0.25676110 - time (sec): 309.93 - samples/sec: 175.15 - lr: 0.000009
2023-07-26 19:44:18,472 epoch 42 - iter 192/243 - loss 0.25789268 - time (sec): 354.20 - samples/sec: 175.15 - lr: 0.000009
2023-07-26 19:45:02,833 epoch 42 - iter 216/243 - loss 0.25889165 - time (sec): 398.57 - samples/sec: 175.63 - lr: 0.000009
2023-07-26 19:45:47,116 epoch 42 - iter 240/243 - loss 0.25885055 - time (sec): 442.85 - samples/sec: 175.64 - lr: 0.000009
2023-07-26 19:45:52,133 ----------------------------------------------------------------------------------------------------
2023-07-26 19:45:52,133 EPOCH 42 done: loss 0.2584 - lr 0.000009
2023-07-26 19:45:54,001 Evaluating as a multi-label problem: False
2023-07-26 19:45:54,045 DEV : loss 0.2509002983570099 - f1-score (micro avg) 0.9776
2023-07-26 19:45:54,056 ----------------------------------------------------------------------------------------------------
2023-07-26 19:46:38,776 epoch 43 - iter 24/243 - loss 0.25656669 - time (sec): 44.72 - samples/sec: 175.88 - lr: 0.000009
2023-07-26 19:47:23,646 epoch 43 - iter 48/243 - loss 0.25713909 - time (sec): 89.59 - samples/sec: 179.17 - lr: 0.000009
2023-07-26 19:48:08,109 epoch 43 - iter 72/243 - loss 0.25209780 - time (sec): 134.05 - samples/sec: 176.45 - lr: 0.000009
2023-07-26 19:48:52,698 epoch 43 - iter 96/243 - loss 0.24509857 - time (sec): 178.64 - samples/sec: 175.75 - lr: 0.000009
2023-07-26 19:49:37,182 epoch 43 - iter 120/243 - loss 0.25000579 - time (sec): 223.13 - samples/sec: 174.94 - lr: 0.000008
2023-07-26 19:50:21,736 epoch 43 - iter 144/243 - loss 0.25295949 - time (sec): 267.68 - samples/sec: 175.08 - lr: 0.000008
2023-07-26 19:51:06,420 epoch 43 - iter 168/243 - loss 0.25493036 - time (sec): 312.36 - samples/sec: 175.74 - lr: 0.000008
2023-07-26 19:51:50,778 epoch 43 - iter 192/243 - loss 0.25313033 - time (sec): 356.72 - samples/sec: 174.71 - lr: 0.000008
2023-07-26 19:52:35,121 epoch 43 - iter 216/243 - loss 0.25255837 - time (sec): 401.06 - samples/sec: 174.20 - lr: 0.000008
2023-07-26 19:53:19,699 epoch 43 - iter 240/243 - loss 0.25326105 - time (sec): 445.64 - samples/sec: 174.52 - lr: 0.000008
2023-07-26 19:53:24,805 ----------------------------------------------------------------------------------------------------
2023-07-26 19:53:24,805 EPOCH 43 done: loss 0.2536 - lr 0.000008
2023-07-26 19:53:27,059 Evaluating as a multi-label problem: False
2023-07-26 19:53:27,103 DEV : loss 0.25337928533554077 - f1-score (micro avg) 0.9784
2023-07-26 19:53:27,114 ----------------------------------------------------------------------------------------------------
2023-07-26 19:54:11,472 epoch 44 - iter 24/243 - loss 0.22752064 - time (sec): 44.36 - samples/sec: 169.73 - lr: 0.000008
2023-07-26 19:54:55,919 epoch 44 - iter 48/243 - loss 0.23951614 - time (sec): 88.80 - samples/sec: 171.20 - lr: 0.000008
2023-07-26 19:55:40,452 epoch 44 - iter 72/243 - loss 0.23986022 - time (sec): 133.34 - samples/sec: 171.97 - lr: 0.000008
2023-07-26 19:56:25,023 epoch 44 - iter 96/243 - loss 0.24528781 - time (sec): 177.91 - samples/sec: 173.40 - lr: 0.000007
2023-07-26 19:57:09,511 epoch 44 - iter 120/243 - loss 0.24572088 - time (sec): 222.40 - samples/sec: 173.40 - lr: 0.000007
2023-07-26 19:57:54,163 epoch 44 - iter 144/243 - loss 0.24464183 - time (sec): 267.05 - samples/sec: 173.04 - lr: 0.000007
2023-07-26 19:58:39,149 epoch 44 - iter 168/243 - loss 0.24523592 - time (sec): 312.04 - samples/sec: 173.72 - lr: 0.000007
2023-07-26 19:59:23,881 epoch 44 - iter 192/243 - loss 0.24519757 - time (sec): 356.77 - samples/sec: 173.61 - lr: 0.000007
2023-07-26 20:00:08,665 epoch 44 - iter 216/243 - loss 0.24456227 - time (sec): 401.55 - samples/sec: 173.99 - lr: 0.000007
2023-07-26 20:00:53,278 epoch 44 - iter 240/243 - loss 0.24582873 - time (sec): 446.16 - samples/sec: 174.08 - lr: 0.000007
2023-07-26 20:00:58,393 ----------------------------------------------------------------------------------------------------
2023-07-26 20:00:58,393 EPOCH 44 done: loss 0.2462 - lr 0.000007
2023-07-26 20:01:00,158 Evaluating as a multi-label problem: False
2023-07-26 20:01:00,200 DEV : loss 0.25915977358818054 - f1-score (micro avg) 0.9784
2023-07-26 20:01:00,210 ----------------------------------------------------------------------------------------------------
2023-07-26 20:01:44,820 epoch 45 - iter 24/243 - loss 0.26201019 - time (sec): 44.61 - samples/sec: 176.98 - lr: 0.000007
2023-07-26 20:02:29,464 epoch 45 - iter 48/243 - loss 0.24779270 - time (sec): 89.25 - samples/sec: 174.35 - lr: 0.000007
2023-07-26 20:03:13,973 epoch 45 - iter 72/243 - loss 0.25012887 - time (sec): 133.76 - samples/sec: 174.72 - lr: 0.000006
2023-07-26 20:03:58,625 epoch 45 - iter 96/243 - loss 0.25289868 - time (sec): 178.41 - samples/sec: 174.60 - lr: 0.000006
2023-07-26 20:04:43,139 epoch 45 - iter 120/243 - loss 0.25326284 - time (sec): 222.93 - samples/sec: 174.12 - lr: 0.000006
2023-07-26 20:05:27,809 epoch 45 - iter 144/243 - loss 0.25373868 - time (sec): 267.60 - samples/sec: 174.76 - lr: 0.000006
2023-07-26 20:06:12,288 epoch 45 - iter 168/243 - loss 0.25215421 - time (sec): 312.08 - samples/sec: 174.53 - lr: 0.000006
2023-07-26 20:06:56,723 epoch 45 - iter 192/243 - loss 0.25175489 - time (sec): 356.51 - samples/sec: 174.02 - lr: 0.000006
2023-07-26 20:07:41,287 epoch 45 - iter 216/243 - loss 0.24952171 - time (sec): 401.08 - samples/sec: 174.05 - lr: 0.000006
2023-07-26 20:08:25,996 epoch 45 - iter 240/243 - loss 0.25004168 - time (sec): 445.79 - samples/sec: 174.41 - lr: 0.000006
2023-07-26 20:08:31,078 ----------------------------------------------------------------------------------------------------
2023-07-26 20:08:31,079 EPOCH 45 done: loss 0.2503 - lr 0.000006
2023-07-26 20:08:32,834 Evaluating as a multi-label problem: False
2023-07-26 20:08:32,877 DEV : loss 0.2550533413887024 - f1-score (micro avg) 0.9788
2023-07-26 20:08:32,887 ----------------------------------------------------------------------------------------------------
2023-07-26 20:09:17,479 epoch 46 - iter 24/243 - loss 0.24479678 - time (sec): 44.59 - samples/sec: 177.79 - lr: 0.000006
2023-07-26 20:10:02,067 epoch 46 - iter 48/243 - loss 0.24138586 - time (sec): 89.18 - samples/sec: 175.65 - lr: 0.000005
2023-07-26 20:10:46,638 epoch 46 - iter 72/243 - loss 0.24404064 - time (sec): 133.75 - samples/sec: 175.18 - lr: 0.000005
2023-07-26 20:11:31,127 epoch 46 - iter 96/243 - loss 0.24604064 - time (sec): 178.24 - samples/sec: 174.01 - lr: 0.000005
2023-07-26 20:12:15,792 epoch 46 - iter 120/243 - loss 0.24783294 - time (sec): 222.91 - samples/sec: 174.51 - lr: 0.000005
2023-07-26 20:13:00,505 epoch 46 - iter 144/243 - loss 0.24973562 - time (sec): 267.62 - samples/sec: 174.34 - lr: 0.000005
2023-07-26 20:13:45,181 epoch 46 - iter 168/243 - loss 0.24967162 - time (sec): 312.29 - samples/sec: 173.97 - lr: 0.000005
2023-07-26 20:14:30,156 epoch 46 - iter 192/243 - loss 0.25131667 - time (sec): 357.27 - samples/sec: 173.94 - lr: 0.000005
2023-07-26 20:15:14,977 epoch 46 - iter 216/243 - loss 0.25004815 - time (sec): 402.09 - samples/sec: 174.06 - lr: 0.000005
2023-07-26 20:15:59,586 epoch 46 - iter 240/243 - loss 0.24797003 - time (sec): 446.70 - samples/sec: 174.19 - lr: 0.000005
2023-07-26 20:16:04,601 ----------------------------------------------------------------------------------------------------
2023-07-26 20:16:04,602 EPOCH 46 done: loss 0.2475 - lr 0.000005
2023-07-26 20:16:06,359 Evaluating as a multi-label problem: False
2023-07-26 20:16:06,401 DEV : loss 0.2502936124801636 - f1-score (micro avg) 0.9796
2023-07-26 20:16:06,411 ----------------------------------------------------------------------------------------------------
2023-07-26 20:16:50,970 epoch 47 - iter 24/243 - loss 0.24652539 - time (sec): 44.56 - samples/sec: 177.11 - lr: 0.000004
2023-07-26 20:17:35,687 epoch 47 - iter 48/243 - loss 0.25432254 - time (sec): 89.28 - samples/sec: 178.43 - lr: 0.000004
2023-07-26 20:18:20,313 epoch 47 - iter 72/243 - loss 0.24907829 - time (sec): 133.90 - samples/sec: 178.67 - lr: 0.000004
2023-07-26 20:19:04,573 epoch 47 - iter 96/243 - loss 0.25143514 - time (sec): 178.16 - samples/sec: 175.41 - lr: 0.000004
2023-07-26 20:19:49,067 epoch 47 - iter 120/243 - loss 0.25195942 - time (sec): 222.66 - samples/sec: 174.82 - lr: 0.000004
2023-07-26 20:20:33,729 epoch 47 - iter 144/243 - loss 0.25140692 - time (sec): 267.32 - samples/sec: 175.12 - lr: 0.000004
2023-07-26 20:21:18,294 epoch 47 - iter 168/243 - loss 0.25098133 - time (sec): 311.88 - samples/sec: 175.27 - lr: 0.000004
2023-07-26 20:22:02,731 epoch 47 - iter 192/243 - loss 0.24903435 - time (sec): 356.32 - samples/sec: 174.38 - lr: 0.000004
2023-07-26 20:22:47,241 epoch 47 - iter 216/243 - loss 0.24707558 - time (sec): 400.83 - samples/sec: 174.35 - lr: 0.000004
2023-07-26 20:23:31,808 epoch 47 - iter 240/243 - loss 0.24996260 - time (sec): 445.40 - samples/sec: 174.50 - lr: 0.000003
2023-07-26 20:23:36,885 ----------------------------------------------------------------------------------------------------
2023-07-26 20:23:36,885 EPOCH 47 done: loss 0.2500 - lr 0.000003
2023-07-26 20:23:38,718 Evaluating as a multi-label problem: False
2023-07-26 20:23:38,760 DEV : loss 0.25260353088378906 - f1-score (micro avg) 0.9788
2023-07-26 20:23:38,770 ----------------------------------------------------------------------------------------------------
2023-07-26 20:24:23,284 epoch 48 - iter 24/243 - loss 0.26092477 - time (sec): 44.51 - samples/sec: 173.72 - lr: 0.000003
2023-07-26 20:25:07,731 epoch 48 - iter 48/243 - loss 0.26380496 - time (sec): 88.96 - samples/sec: 172.51 - lr: 0.000003
2023-07-26 20:25:52,549 epoch 48 - iter 72/243 - loss 0.26586966 - time (sec): 133.78 - samples/sec: 175.68 - lr: 0.000003
2023-07-26 20:26:37,081 epoch 48 - iter 96/243 - loss 0.26118560 - time (sec): 178.31 - samples/sec: 175.37 - lr: 0.000003
2023-07-26 20:27:21,769 epoch 48 - iter 120/243 - loss 0.25715945 - time (sec): 223.00 - samples/sec: 176.11 - lr: 0.000003
2023-07-26 20:28:06,589 epoch 48 - iter 144/243 - loss 0.25935501 - time (sec): 267.82 - samples/sec: 176.32 - lr: 0.000003
2023-07-26 20:28:51,230 epoch 48 - iter 168/243 - loss 0.25807126 - time (sec): 312.46 - samples/sec: 175.36 - lr: 0.000003
2023-07-26 20:29:35,872 epoch 48 - iter 192/243 - loss 0.25819322 - time (sec): 357.10 - samples/sec: 174.73 - lr: 0.000003
2023-07-26 20:30:20,621 epoch 48 - iter 216/243 - loss 0.25780077 - time (sec): 401.85 - samples/sec: 174.84 - lr: 0.000002
2023-07-26 20:31:05,115 epoch 48 - iter 240/243 - loss 0.25669533 - time (sec): 446.34 - samples/sec: 174.17 - lr: 0.000002
2023-07-26 20:31:10,189 ----------------------------------------------------------------------------------------------------
2023-07-26 20:31:10,189 EPOCH 48 done: loss 0.2562 - lr 0.000002
2023-07-26 20:31:11,946 Evaluating as a multi-label problem: False
2023-07-26 20:31:11,989 DEV : loss 0.2517630159854889 - f1-score (micro avg) 0.9793
2023-07-26 20:31:11,998 ----------------------------------------------------------------------------------------------------
2023-07-26 20:31:56,576 epoch 49 - iter 24/243 - loss 0.27952006 - time (sec): 44.58 - samples/sec: 171.79 - lr: 0.000002
2023-07-26 20:32:41,285 epoch 49 - iter 48/243 - loss 0.26483505 - time (sec): 89.29 - samples/sec: 172.32 - lr: 0.000002
2023-07-26 20:33:25,782 epoch 49 - iter 72/243 - loss 0.25971199 - time (sec): 133.78 - samples/sec: 171.90 - lr: 0.000002
2023-07-26 20:34:10,460 epoch 49 - iter 96/243 - loss 0.25971123 - time (sec): 178.46 - samples/sec: 173.31 - lr: 0.000002
2023-07-26 20:34:55,145 epoch 49 - iter 120/243 - loss 0.25121870 - time (sec): 223.15 - samples/sec: 174.45 - lr: 0.000002
2023-07-26 20:35:39,794 epoch 49 - iter 144/243 - loss 0.24985456 - time (sec): 267.80 - samples/sec: 174.14 - lr: 0.000002
2023-07-26 20:36:24,454 epoch 49 - iter 168/243 - loss 0.25019492 - time (sec): 312.46 - samples/sec: 173.74 - lr: 0.000002
2023-07-26 20:37:09,180 epoch 49 - iter 192/243 - loss 0.24964407 - time (sec): 357.18 - samples/sec: 174.05 - lr: 0.000001
2023-07-26 20:37:53,667 epoch 49 - iter 216/243 - loss 0.24966262 - time (sec): 401.67 - samples/sec: 173.91 - lr: 0.000001
2023-07-26 20:38:38,222 epoch 49 - iter 240/243 - loss 0.24839303 - time (sec): 446.22 - samples/sec: 173.82 - lr: 0.000001
2023-07-26 20:38:43,407 ----------------------------------------------------------------------------------------------------
2023-07-26 20:38:43,407 EPOCH 49 done: loss 0.2480 - lr 0.000001
2023-07-26 20:38:45,164 Evaluating as a multi-label problem: False
2023-07-26 20:38:45,206 DEV : loss 0.25181668996810913 - f1-score (micro avg) 0.9786
2023-07-26 20:38:45,216 ----------------------------------------------------------------------------------------------------
2023-07-26 20:39:30,103 epoch 50 - iter 24/243 - loss 0.26114983 - time (sec): 44.89 - samples/sec: 184.97 - lr: 0.000001
2023-07-26 20:40:14,469 epoch 50 - iter 48/243 - loss 0.24629344 - time (sec): 89.25 - samples/sec: 177.23 - lr: 0.000001
2023-07-26 20:40:58,962 epoch 50 - iter 72/243 - loss 0.24771674 - time (sec): 133.75 - samples/sec: 176.12 - lr: 0.000001
2023-07-26 20:41:43,633 epoch 50 - iter 96/243 - loss 0.24705085 - time (sec): 178.42 - samples/sec: 176.67 - lr: 0.000001
2023-07-26 20:42:28,058 epoch 50 - iter 120/243 - loss 0.24435267 - time (sec): 222.84 - samples/sec: 175.63 - lr: 0.000001
2023-07-26 20:43:12,552 epoch 50 - iter 144/243 - loss 0.24537610 - time (sec): 267.34 - samples/sec: 175.26 - lr: 0.000001
2023-07-26 20:43:57,183 epoch 50 - iter 168/243 - loss 0.24725247 - time (sec): 311.97 - samples/sec: 175.35 - lr: 0.000000
2023-07-26 20:44:42,166 epoch 50 - iter 192/243 - loss 0.24773009 - time (sec): 356.95 - samples/sec: 174.58 - lr: 0.000000
2023-07-26 20:45:27,096 epoch 50 - iter 216/243 - loss 0.24906212 - time (sec): 401.88 - samples/sec: 173.96 - lr: 0.000000
2023-07-26 20:46:12,548 epoch 50 - iter 240/243 - loss 0.24977353 - time (sec): 447.33 - samples/sec: 173.87 - lr: 0.000000
2023-07-26 20:46:17,709 ----------------------------------------------------------------------------------------------------
2023-07-26 20:46:17,709 EPOCH 50 done: loss 0.2503 - lr 0.000000
2023-07-26 20:46:19,451 Evaluating as a multi-label problem: False
2023-07-26 20:46:19,493 DEV : loss 0.2513697147369385 - f1-score (micro avg) 0.9784
2023-07-26 20:46:22,002 Test data not provided setting final score to 0