2023-10-25 00:48:38,812 ----------------------------------------------------------------------------------------------------
2023-10-25 00:48:38,812 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(64001, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
            (intermediate): BertIntermediate(
              (dense): Linear(in_features=768, out_features=3072, bias=True)
              (intermediate_act_fn): GELUActivation()
            )
            (output): BertOutput(
              (dense): Linear(in_features=3072, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
        )
      )
      (pooler): BertPooler(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (activation): Tanh()
      )
    )
  )
  (locked_dropout): LockedDropout(p=0.5)
  (linear): Linear(in_features=768, out_features=13, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2023-10-25 00:48:38,813 ----------------------------------------------------------------------------------------------------
2023-10-25 00:48:38,813 MultiCorpus: 5777 train + 722 dev + 723 test sentences
 - NER_ICDAR_EUROPEANA Corpus: 5777 train + 722 dev + 723 test sentences - /home/ubuntu/.flair/datasets/ner_icdar_europeana/nl
2023-10-25 00:48:38,813 ----------------------------------------------------------------------------------------------------
2023-10-25 00:48:38,813 Train: 5777 sentences
2023-10-25 00:48:38,813 (train_with_dev=False, train_with_test=False)
2023-10-25 00:48:38,813 ----------------------------------------------------------------------------------------------------
2023-10-25 00:48:38,813 Training Params:
2023-10-25 00:48:38,813  - learning_rate: "3e-05"
2023-10-25 00:48:38,813  - mini_batch_size: "8"
2023-10-25 00:48:38,813  - max_epochs: "10"
2023-10-25 00:48:38,813  - shuffle: "True"
2023-10-25 00:48:38,813 ----------------------------------------------------------------------------------------------------
2023-10-25 00:48:38,813 Plugins:
2023-10-25 00:48:38,813  - TensorboardLogger
2023-10-25 00:48:38,813  - LinearScheduler | warmup_fraction: '0.1'
2023-10-25 00:48:38,813 ----------------------------------------------------------------------------------------------------
2023-10-25 00:48:38,813 Final evaluation on model from best epoch (best-model.pt)
2023-10-25 00:48:38,813  - metric: "('micro avg', 'f1-score')"
2023-10-25 00:48:38,813 ----------------------------------------------------------------------------------------------------
2023-10-25 00:48:38,813 Computation:
2023-10-25 00:48:38,813  - compute on device: cuda:0
2023-10-25 00:48:38,813  - embedding storage: none
2023-10-25 00:48:38,813 ----------------------------------------------------------------------------------------------------
2023-10-25 00:48:38,813 Model training base path: "hmbench-icdar/nl-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-3"
2023-10-25 00:48:38,814 ----------------------------------------------------------------------------------------------------
2023-10-25 00:48:38,814 ----------------------------------------------------------------------------------------------------
2023-10-25 00:48:38,814 Logging anything other than scalars to TensorBoard is currently not supported.
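
The setup above maps directly onto Flair's fine-tuning API. As a reading aid, here is a minimal sketch of how this configuration could be reproduced; it is reconstructed from the logged parameters and the base-path naming (bs8, wsFalse, e10, lr3e-05, poolingfirst, layers-1, crfFalse), not taken from the original hmBench training script, and it assumes a Flair 0.12/0.13-era API whose exact argument names may differ between versions.

# Sketch (assumed), reconstructed from the logged parameters; not the original script.
from flair.datasets import NER_ICDAR_EUROPEANA
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# Dutch split of the ICDAR-Europeana NER corpus (5777 train / 722 dev / 723 test)
corpus = NER_ICDAR_EUROPEANA(language="nl")
label_dict = corpus.make_label_dictionary(label_type="ner")

# "poolingfirst-layers-1" in the base path: first-subtoken pooling, last layer only
embeddings = TransformerWordEmbeddings(
    model="dbmdz/bert-base-historic-multilingual-64k-td-cased",
    layers="-1",
    subtoken_pooling="first",
    fine_tune=True,
)

# "crfFalse": no CRF and no RNN, i.e. a plain linear projection
# (768 -> 13 BIOES tags) trained with CrossEntropyLoss, as in the model dump above
tagger = SequenceTagger(
    hidden_size=256,  # unused when use_rnn=False
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type="ner",
    use_crf=False,
    use_rnn=False,
    reproject_embeddings=False,
)

# fine_tune() applies a linear LR schedule with 10% warmup by default,
# matching the LinearScheduler | warmup_fraction: '0.1' plugin logged above
trainer = ModelTrainer(tagger, corpus)
trainer.fine_tune(
    "hmbench-icdar/nl-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-3",
    learning_rate=3e-5,
    mini_batch_size=8,
    max_epochs=10,
)
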
2023-10-25 00:48:47,882 epoch 1 - iter 72/723 - loss 1.72003712 - time (sec): 9.07 - samples/sec: 2031.45 - lr: 0.000003 - momentum: 0.000000
2023-10-25 00:48:55,830 epoch 1 - iter 144/723 - loss 1.06884587 - time (sec): 17.02 - samples/sec: 2033.04 - lr: 0.000006 - momentum: 0.000000
2023-10-25 00:49:04,323 epoch 1 - iter 216/723 - loss 0.79665933 - time (sec): 25.51 - samples/sec: 2032.61 - lr: 0.000009 - momentum: 0.000000
2023-10-25 00:49:12,388 epoch 1 - iter 288/723 - loss 0.64604363 - time (sec): 33.57 - samples/sec: 2052.03 - lr: 0.000012 - momentum: 0.000000
2023-10-25 00:49:21,743 epoch 1 - iter 360/723 - loss 0.53992684 - time (sec): 42.93 - samples/sec: 2043.60 - lr: 0.000015 - momentum: 0.000000
2023-10-25 00:49:29,969 epoch 1 - iter 432/723 - loss 0.47948810 - time (sec): 51.15 - samples/sec: 2042.47 - lr: 0.000018 - momentum: 0.000000
2023-10-25 00:49:38,712 epoch 1 - iter 504/723 - loss 0.42749288 - time (sec): 59.90 - samples/sec: 2044.18 - lr: 0.000021 - momentum: 0.000000
2023-10-25 00:49:47,173 epoch 1 - iter 576/723 - loss 0.39191852 - time (sec): 68.36 - samples/sec: 2050.24 - lr: 0.000024 - momentum: 0.000000
2023-10-25 00:49:55,886 epoch 1 - iter 648/723 - loss 0.36314858 - time (sec): 77.07 - samples/sec: 2049.30 - lr: 0.000027 - momentum: 0.000000
2023-10-25 00:50:04,370 epoch 1 - iter 720/723 - loss 0.34146720 - time (sec): 85.56 - samples/sec: 2051.90 - lr: 0.000030 - momentum: 0.000000
2023-10-25 00:50:04,766 ----------------------------------------------------------------------------------------------------
2023-10-25 00:50:04,766 EPOCH 1 done: loss 0.3406 - lr: 0.000030
2023-10-25 00:50:08,076 DEV : loss 0.10530021041631699 - f1-score (micro avg) 0.7179
2023-10-25 00:50:08,088 saving best model
2023-10-25 00:50:08,645 ----------------------------------------------------------------------------------------------------
2023-10-25 00:50:17,227 epoch 2 - iter 72/723 - loss 0.11020220 - time (sec): 8.58 - samples/sec: 2017.49 - lr: 0.000030 - momentum: 0.000000
2023-10-25 00:50:26,179 epoch 2 - iter 144/723 - loss 0.11776963 - time (sec): 17.53 - samples/sec: 2039.48 - lr: 0.000029 - momentum: 0.000000
2023-10-25 00:50:35,131 epoch 2 - iter 216/723 - loss 0.11133959 - time (sec): 26.48 - samples/sec: 2039.30 - lr: 0.000029 - momentum: 0.000000
2023-10-25 00:50:44,338 epoch 2 - iter 288/723 - loss 0.10679271 - time (sec): 35.69 - samples/sec: 2022.90 - lr: 0.000029 - momentum: 0.000000
2023-10-25 00:50:53,030 epoch 2 - iter 360/723 - loss 0.10466024 - time (sec): 44.38 - samples/sec: 2011.78 - lr: 0.000028 - momentum: 0.000000
2023-10-25 00:51:01,657 epoch 2 - iter 432/723 - loss 0.10254726 - time (sec): 53.01 - samples/sec: 2027.29 - lr: 0.000028 - momentum: 0.000000
2023-10-25 00:51:10,124 epoch 2 - iter 504/723 - loss 0.10045793 - time (sec): 61.48 - samples/sec: 2015.95 - lr: 0.000028 - momentum: 0.000000
2023-10-25 00:51:18,268 epoch 2 - iter 576/723 - loss 0.09869595 - time (sec): 69.62 - samples/sec: 2025.86 - lr: 0.000027 - momentum: 0.000000
2023-10-25 00:51:26,720 epoch 2 - iter 648/723 - loss 0.09948556 - time (sec): 78.07 - samples/sec: 2025.43 - lr: 0.000027 - momentum: 0.000000
2023-10-25 00:51:35,278 epoch 2 - iter 720/723 - loss 0.09645545 - time (sec): 86.63 - samples/sec: 2026.75 - lr: 0.000027 - momentum: 0.000000
2023-10-25 00:51:35,717 ----------------------------------------------------------------------------------------------------
2023-10-25 00:51:35,717 EPOCH 2 done: loss 0.0963 - lr: 0.000027
2023-10-25 00:51:39,428 DEV : loss 0.08788451552391052 - f1-score (micro avg) 0.8115
2023-10-25 00:51:39,439 saving best model
2023-10-25 00:51:40,154 ----------------------------------------------------------------------------------------------------
2023-10-25 00:51:48,954 epoch 3 - iter 72/723 - loss 0.07481811 - time (sec): 8.80 - samples/sec: 1954.72 - lr: 0.000026 - momentum: 0.000000
2023-10-25 00:51:57,542 epoch 3 - iter 144/723 - loss 0.06232580 - time (sec): 17.39 - samples/sec: 2021.16 - lr: 0.000026 - momentum: 0.000000
2023-10-25 00:52:05,977 epoch 3 - iter 216/723 - loss 0.06302027 - time (sec): 25.82 - samples/sec: 2034.40 - lr: 0.000026 - momentum: 0.000000
2023-10-25 00:52:14,414 epoch 3 - iter 288/723 - loss 0.06222395 - time (sec): 34.26 - samples/sec: 2041.25 - lr: 0.000025 - momentum: 0.000000
2023-10-25 00:52:23,153 epoch 3 - iter 360/723 - loss 0.06587325 - time (sec): 43.00 - samples/sec: 2035.00 - lr: 0.000025 - momentum: 0.000000
2023-10-25 00:52:31,858 epoch 3 - iter 432/723 - loss 0.06664348 - time (sec): 51.70 - samples/sec: 2022.37 - lr: 0.000025 - momentum: 0.000000
2023-10-25 00:52:39,975 epoch 3 - iter 504/723 - loss 0.06685226 - time (sec): 59.82 - samples/sec: 2035.71 - lr: 0.000024 - momentum: 0.000000
2023-10-25 00:52:49,293 epoch 3 - iter 576/723 - loss 0.06583708 - time (sec): 69.14 - samples/sec: 2035.50 - lr: 0.000024 - momentum: 0.000000
2023-10-25 00:52:58,250 epoch 3 - iter 648/723 - loss 0.06530041 - time (sec): 78.09 - samples/sec: 2027.41 - lr: 0.000024 - momentum: 0.000000
2023-10-25 00:53:06,920 epoch 3 - iter 720/723 - loss 0.06485314 - time (sec): 86.76 - samples/sec: 2025.22 - lr: 0.000023 - momentum: 0.000000
2023-10-25 00:53:07,226 ----------------------------------------------------------------------------------------------------
2023-10-25 00:53:07,226 EPOCH 3 done: loss 0.0648 - lr: 0.000023
2023-10-25 00:53:10,958 DEV : loss 0.09126865863800049 - f1-score (micro avg) 0.808
2023-10-25 00:53:10,970 ----------------------------------------------------------------------------------------------------
2023-10-25 00:53:19,814 epoch 4 - iter 72/723 - loss 0.05801881 - time (sec): 8.84 - samples/sec: 2014.17 - lr: 0.000023 - momentum: 0.000000
2023-10-25 00:53:28,442 epoch 4 - iter 144/723 - loss 0.05254569 - time (sec): 17.47 - samples/sec: 2000.81 - lr: 0.000023 - momentum: 0.000000
2023-10-25 00:53:37,056 epoch 4 - iter 216/723 - loss 0.04693350 - time (sec): 26.09 - samples/sec: 2048.72 - lr: 0.000022 - momentum: 0.000000
2023-10-25 00:53:45,825 epoch 4 - iter 288/723 - loss 0.04463563 - time (sec): 34.85 - samples/sec: 2056.73 - lr: 0.000022 - momentum: 0.000000
2023-10-25 00:53:53,675 epoch 4 - iter 360/723 - loss 0.04539628 - time (sec): 42.70 - samples/sec: 2052.75 - lr: 0.000022 - momentum: 0.000000
2023-10-25 00:54:02,452 epoch 4 - iter 432/723 - loss 0.04333194 - time (sec): 51.48 - samples/sec: 2043.45 - lr: 0.000021 - momentum: 0.000000
2023-10-25 00:54:11,013 epoch 4 - iter 504/723 - loss 0.04210679 - time (sec): 60.04 - samples/sec: 2041.87 - lr: 0.000021 - momentum: 0.000000
2023-10-25 00:54:19,932 epoch 4 - iter 576/723 - loss 0.04334323 - time (sec): 68.96 - samples/sec: 2036.56 - lr: 0.000021 - momentum: 0.000000
2023-10-25 00:54:28,525 epoch 4 - iter 648/723 - loss 0.04361753 - time (sec): 77.55 - samples/sec: 2040.35 - lr: 0.000020 - momentum: 0.000000
2023-10-25 00:54:37,091 epoch 4 - iter 720/723 - loss 0.04348726 - time (sec): 86.12 - samples/sec: 2041.20 - lr: 0.000020 - momentum: 0.000000
2023-10-25 00:54:37,376 ----------------------------------------------------------------------------------------------------
2023-10-25 00:54:37,376 EPOCH 4 done: loss 0.0434 - lr: 0.000020
2023-10-25 00:54:40,820 DEV : loss 0.08081907033920288 - f1-score (micro avg) 0.8266
2023-10-25 00:54:40,832 saving best model
2023-10-25 00:54:41,493 ----------------------------------------------------------------------------------------------------
2023-10-25 00:54:50,199 epoch 5 - iter 72/723 - loss 0.02338563 - time (sec): 8.71 - samples/sec: 1947.70 - lr: 0.000020 - momentum: 0.000000
2023-10-25 00:54:58,510 epoch 5 - iter 144/723 - loss 0.02569534 - time (sec): 17.02 - samples/sec: 1999.83 - lr: 0.000019 - momentum: 0.000000
2023-10-25 00:55:07,462 epoch 5 - iter 216/723 - loss 0.02462389 - time (sec): 25.97 - samples/sec: 2016.31 - lr: 0.000019 - momentum: 0.000000
2023-10-25 00:55:15,945 epoch 5 - iter 288/723 - loss 0.02616514 - time (sec): 34.45 - samples/sec: 2010.79 - lr: 0.000019 - momentum: 0.000000
2023-10-25 00:55:24,734 epoch 5 - iter 360/723 - loss 0.03003901 - time (sec): 43.24 - samples/sec: 2010.78 - lr: 0.000018 - momentum: 0.000000
2023-10-25 00:55:34,005 epoch 5 - iter 432/723 - loss 0.02923547 - time (sec): 52.51 - samples/sec: 2015.49 - lr: 0.000018 - momentum: 0.000000
2023-10-25 00:55:42,392 epoch 5 - iter 504/723 - loss 0.03072182 - time (sec): 60.90 - samples/sec: 2020.91 - lr: 0.000018 - momentum: 0.000000
2023-10-25 00:55:50,760 epoch 5 - iter 576/723 - loss 0.03169806 - time (sec): 69.27 - samples/sec: 2024.04 - lr: 0.000017 - momentum: 0.000000
2023-10-25 00:55:59,385 epoch 5 - iter 648/723 - loss 0.03050389 - time (sec): 77.89 - samples/sec: 2032.02 - lr: 0.000017 - momentum: 0.000000
2023-10-25 00:56:08,048 epoch 5 - iter 720/723 - loss 0.03129899 - time (sec): 86.55 - samples/sec: 2030.87 - lr: 0.000017 - momentum: 0.000000
2023-10-25 00:56:08,339 ----------------------------------------------------------------------------------------------------
2023-10-25 00:56:08,340 EPOCH 5 done: loss 0.0312 - lr: 0.000017
2023-10-25 00:56:11,780 DEV : loss 0.10549487918615341 - f1-score (micro avg) 0.8373
2023-10-25 00:56:11,791 saving best model
2023-10-25 00:56:12,497 ----------------------------------------------------------------------------------------------------
2023-10-25 00:56:21,422 epoch 6 - iter 72/723 - loss 0.01906613 - time (sec): 8.92 - samples/sec: 2022.88 - lr: 0.000016 - momentum: 0.000000
2023-10-25 00:56:29,665 epoch 6 - iter 144/723 - loss 0.02198542 - time (sec): 17.17 - samples/sec: 2044.11 - lr: 0.000016 - momentum: 0.000000
2023-10-25 00:56:38,022 epoch 6 - iter 216/723 - loss 0.02191414 - time (sec): 25.52 - samples/sec: 2063.73 - lr: 0.000016 - momentum: 0.000000
2023-10-25 00:56:46,711 epoch 6 - iter 288/723 - loss 0.02097993 - time (sec): 34.21 - samples/sec: 2050.63 - lr: 0.000015 - momentum: 0.000000
2023-10-25 00:56:55,459 epoch 6 - iter 360/723 - loss 0.02000892 - time (sec): 42.96 - samples/sec: 2051.17 - lr: 0.000015 - momentum: 0.000000
2023-10-25 00:57:03,945 epoch 6 - iter 432/723 - loss 0.02037415 - time (sec): 51.45 - samples/sec: 2047.64 - lr: 0.000015 - momentum: 0.000000
2023-10-25 00:57:12,254 epoch 6 - iter 504/723 - loss 0.02043936 - time (sec): 59.76 - samples/sec: 2048.27 - lr: 0.000014 - momentum: 0.000000
2023-10-25 00:57:20,921 epoch 6 - iter 576/723 - loss 0.02169775 - time (sec): 68.42 - samples/sec: 2047.48 - lr: 0.000014 - momentum: 0.000000
2023-10-25 00:57:30,035 epoch 6 - iter 648/723 - loss 0.02173051 - time (sec): 77.54 - samples/sec: 2051.76 - lr: 0.000014 - momentum: 0.000000
2023-10-25 00:57:38,493 epoch 6 - iter 720/723 - loss 0.02229266 - time (sec): 86.00 - samples/sec: 2045.05 - lr: 0.000013 - momentum: 0.000000
2023-10-25 00:57:38,722 ----------------------------------------------------------------------------------------------------
2023-10-25 00:57:38,723 EPOCH 6 done: loss 0.0224 - lr: 0.000013
2023-10-25 00:57:42,454 DEV : loss 0.13726861774921417 - f1-score (micro avg) 0.8114
2023-10-25 00:57:42,465 ----------------------------------------------------------------------------------------------------
2023-10-25 00:57:51,002 epoch 7 - iter 72/723 - loss 0.01036445 - time (sec): 8.54 - samples/sec: 2045.42 - lr: 0.000013 - momentum: 0.000000
2023-10-25 00:57:59,138 epoch 7 - iter 144/723 - loss 0.01477027 - time (sec): 16.67 - samples/sec: 2035.45 - lr: 0.000013 - momentum: 0.000000
2023-10-25 00:58:08,726 epoch 7 - iter 216/723 - loss 0.01492918 - time (sec): 26.26 - samples/sec: 2050.94 - lr: 0.000012 - momentum: 0.000000
2023-10-25 00:58:17,268 epoch 7 - iter 288/723 - loss 0.01479687 - time (sec): 34.80 - samples/sec: 2046.22 - lr: 0.000012 - momentum: 0.000000
2023-10-25 00:58:26,329 epoch 7 - iter 360/723 - loss 0.01577013 - time (sec): 43.86 - samples/sec: 2037.22 - lr: 0.000012 - momentum: 0.000000
2023-10-25 00:58:34,383 epoch 7 - iter 432/723 - loss 0.01582818 - time (sec): 51.92 - samples/sec: 2038.95 - lr: 0.000011 - momentum: 0.000000
2023-10-25 00:58:43,970 epoch 7 - iter 504/723 - loss 0.01747443 - time (sec): 61.50 - samples/sec: 2030.49 - lr: 0.000011 - momentum: 0.000000
2023-10-25 00:58:52,302 epoch 7 - iter 576/723 - loss 0.01761193 - time (sec): 69.84 - samples/sec: 2019.28 - lr: 0.000011 - momentum: 0.000000
2023-10-25 00:59:01,085 epoch 7 - iter 648/723 - loss 0.01778744 - time (sec): 78.62 - samples/sec: 2017.48 - lr: 0.000010 - momentum: 0.000000
2023-10-25 00:59:08,891 epoch 7 - iter 720/723 - loss 0.01746302 - time (sec): 86.43 - samples/sec: 2033.21 - lr: 0.000010 - momentum: 0.000000
2023-10-25 00:59:09,138 ----------------------------------------------------------------------------------------------------
2023-10-25 00:59:09,139 EPOCH 7 done: loss 0.0174 - lr: 0.000010
2023-10-25 00:59:12,866 DEV : loss 0.15865251421928406 - f1-score (micro avg) 0.8228
2023-10-25 00:59:12,877 ----------------------------------------------------------------------------------------------------
2023-10-25 00:59:21,084 epoch 8 - iter 72/723 - loss 0.00869406 - time (sec): 8.21 - samples/sec: 2053.12 - lr: 0.000010 - momentum: 0.000000
2023-10-25 00:59:30,064 epoch 8 - iter 144/723 - loss 0.01148832 - time (sec): 17.19 - samples/sec: 1994.82 - lr: 0.000009 - momentum: 0.000000
2023-10-25 00:59:38,374 epoch 8 - iter 216/723 - loss 0.01080879 - time (sec): 25.50 - samples/sec: 1984.63 - lr: 0.000009 - momentum: 0.000000
2023-10-25 00:59:48,252 epoch 8 - iter 288/723 - loss 0.01027925 - time (sec): 35.37 - samples/sec: 1984.33 - lr: 0.000009 - momentum: 0.000000
2023-10-25 00:59:56,717 epoch 8 - iter 360/723 - loss 0.00944012 - time (sec): 43.84 - samples/sec: 1999.65 - lr: 0.000008 - momentum: 0.000000
2023-10-25 01:00:05,292 epoch 8 - iter 432/723 - loss 0.00925629 - time (sec): 52.41 - samples/sec: 2007.54 - lr: 0.000008 - momentum: 0.000000
2023-10-25 01:00:13,956 epoch 8 - iter 504/723 - loss 0.01038197 - time (sec): 61.08 - samples/sec: 2014.29 - lr: 0.000008 - momentum: 0.000000
2023-10-25 01:00:22,579 epoch 8 - iter 576/723 - loss 0.01113833 - time (sec): 69.70 - samples/sec: 2016.00 - lr: 0.000007 - momentum: 0.000000
2023-10-25 01:00:30,912 epoch 8 - iter 648/723 - loss 0.01148794 - time (sec): 78.03 - samples/sec: 2021.46 - lr: 0.000007 - momentum: 0.000000
2023-10-25 01:00:39,400 epoch 8 - iter 720/723 - loss 0.01137677 - time (sec): 86.52 - samples/sec: 2031.51 - lr: 0.000007 - momentum: 0.000000
2023-10-25 01:00:39,658 ----------------------------------------------------------------------------------------------------
2023-10-25 01:00:39,658 EPOCH 8 done: loss 0.0114 - lr: 0.000007
2023-10-25 01:00:43,088 DEV : loss 0.15589797496795654 - f1-score (micro avg) 0.8371
2023-10-25 01:00:43,100 ----------------------------------------------------------------------------------------------------
2023-10-25 01:00:51,884 epoch 9 - iter 72/723 - loss 0.00818030 - time (sec): 8.78 - samples/sec: 2043.18 - lr: 0.000006 - momentum: 0.000000
2023-10-25 01:01:00,425 epoch 9 - iter 144/723 - loss 0.00706545 - time (sec): 17.32 - samples/sec: 2038.06 - lr: 0.000006 - momentum: 0.000000
2023-10-25 01:01:09,236 epoch 9 - iter 216/723 - loss 0.00854649 - time (sec): 26.14 - samples/sec: 2028.69 - lr: 0.000006 - momentum: 0.000000
2023-10-25 01:01:18,365 epoch 9 - iter 288/723 - loss 0.00824646 - time (sec): 35.26 - samples/sec: 2028.84 - lr: 0.000005 - momentum: 0.000000
2023-10-25 01:01:26,827 epoch 9 - iter 360/723 - loss 0.00895845 - time (sec): 43.73 - samples/sec: 2021.73 - lr: 0.000005 - momentum: 0.000000
2023-10-25 01:01:35,238 epoch 9 - iter 432/723 - loss 0.00874757 - time (sec): 52.14 - samples/sec: 2021.16 - lr: 0.000005 - momentum: 0.000000
2023-10-25 01:01:43,809 epoch 9 - iter 504/723 - loss 0.00840101 - time (sec): 60.71 - samples/sec: 2027.92 - lr: 0.000004 - momentum: 0.000000
2023-10-25 01:01:52,727 epoch 9 - iter 576/723 - loss 0.00845279 - time (sec): 69.63 - samples/sec: 2033.64 - lr: 0.000004 - momentum: 0.000000
2023-10-25 01:02:00,977 epoch 9 - iter 648/723 - loss 0.00929842 - time (sec): 77.88 - samples/sec: 2032.69 - lr: 0.000004 - momentum: 0.000000
2023-10-25 01:02:09,687 epoch 9 - iter 720/723 - loss 0.00918250 - time (sec): 86.59 - samples/sec: 2029.07 - lr: 0.000003 - momentum: 0.000000
2023-10-25 01:02:09,944 ----------------------------------------------------------------------------------------------------
2023-10-25 01:02:09,944 EPOCH 9 done: loss 0.0092 - lr: 0.000003
2023-10-25 01:02:13,676 DEV : loss 0.1608613282442093 - f1-score (micro avg) 0.8437
2023-10-25 01:02:13,688 saving best model
2023-10-25 01:02:14,385 ----------------------------------------------------------------------------------------------------
2023-10-25 01:02:22,848 epoch 10 - iter 72/723 - loss 0.00314395 - time (sec): 8.46 - samples/sec: 2042.65 - lr: 0.000003 - momentum: 0.000000
2023-10-25 01:02:31,969 epoch 10 - iter 144/723 - loss 0.00608469 - time (sec): 17.58 - samples/sec: 1985.96 - lr: 0.000003 - momentum: 0.000000
2023-10-25 01:02:40,271 epoch 10 - iter 216/723 - loss 0.00571761 - time (sec): 25.88 - samples/sec: 2012.57 - lr: 0.000002 - momentum: 0.000000
2023-10-25 01:02:48,902 epoch 10 - iter 288/723 - loss 0.00465757 - time (sec): 34.52 - samples/sec: 2029.54 - lr: 0.000002 - momentum: 0.000000
2023-10-25 01:02:57,387 epoch 10 - iter 360/723 - loss 0.00418405 - time (sec): 43.00 - samples/sec: 2032.09 - lr: 0.000002 - momentum: 0.000000
2023-10-25 01:03:06,752 epoch 10 - iter 432/723 - loss 0.00539257 - time (sec): 52.37 - samples/sec: 2043.39 - lr: 0.000001 - momentum: 0.000000
2023-10-25 01:03:15,337 epoch 10 - iter 504/723 - loss 0.00490609 - time (sec): 60.95 - samples/sec: 2044.20 - lr: 0.000001 - momentum: 0.000000
2023-10-25 01:03:24,080 epoch 10 - iter 576/723 - loss 0.00568923 - time (sec): 69.69 - samples/sec: 2042.65 - lr: 0.000001 - momentum: 0.000000
2023-10-25 01:03:32,038 epoch 10 - iter 648/723 - loss 0.00557638 - time (sec): 77.65 - samples/sec: 2042.04 - lr: 0.000000 - momentum: 0.000000
2023-10-25 01:03:40,574 epoch 10 - iter 720/723 - loss 0.00538824 - time (sec): 86.19 - samples/sec: 2040.13 - lr: 0.000000 - momentum: 0.000000
2023-10-25 01:03:40,808 ----------------------------------------------------------------------------------------------------
2023-10-25 01:03:40,808 EPOCH 10 done: loss 0.0054 - lr: 0.000000
2023-10-25 01:03:44,232 DEV : loss 0.17519354820251465 - f1-score (micro avg) 0.8334
2023-10-25 01:03:44,796 ----------------------------------------------------------------------------------------------------
2023-10-25 01:03:44,797 Loading model from best epoch ...
2023-10-25 01:03:46,563 SequenceTagger predicts: Dictionary with 13 tags: O, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-ORG, B-ORG, E-ORG, I-ORG
2023-10-25 01:03:50,088 Results:
- F-score (micro) 0.8164
- F-score (macro) 0.724
- Accuracy 0.6989

By class:
              precision    recall  f1-score   support

         PER     0.8412    0.8133    0.8270       482
         LOC     0.8842    0.8166    0.8490       458
         ORG     0.5769    0.4348    0.4959        69

   micro avg     0.8459    0.7889    0.8164      1009
   macro avg     0.7674    0.6882    0.7240      1009
weighted avg     0.8426    0.7889    0.8144      1009
2023-10-25 01:03:50,088 ----------------------------------------------------------------------------------------------------
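
For reference, tagging new text with the checkpoint evaluated above follows standard Flair usage. A minimal sketch, assuming the base path from this run; the Dutch example sentence is invented for illustration:

# Usage sketch (assumed): load the saved best model and tag a sentence.
from flair.data import Sentence
from flair.models import SequenceTagger

# path matches the "Model training base path" logged at the start of this run
tagger = SequenceTagger.load(
    "hmbench-icdar/nl-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-3/best-model.pt"
)

# invented example sentence, for illustration only
sentence = Sentence("Vincent van Gogh werd geboren in Zundert .")
tagger.predict(sentence)

# spans are decoded from the 13-tag BIOES dictionary logged above
# (O, plus S-/B-/E-/I- variants of LOC, PER and ORG)
for entity in sentence.get_spans("ner"):
    print(entity)
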