2023-10-25 15:38:47,008 ---------------------------------------------------------------------------------------------------- 2023-10-25 15:38:47,009 Model: "SequenceTagger( (embeddings): TransformerWordEmbeddings( (model): BertModel( (embeddings): BertEmbeddings( (word_embeddings): Embedding(64001, 768) (position_embeddings): Embedding(512, 768) (token_type_embeddings): Embedding(2, 768) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): BertEncoder( (layer): ModuleList( (0): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (1): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (2): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (3): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (4): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (5): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (6): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (7): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (8): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (9): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (10): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (11): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (pooler): BertPooler( (dense): Linear(in_features=768, out_features=768, bias=True) (activation): Tanh() ) ) ) (locked_dropout): LockedDropout(p=0.5) (linear): Linear(in_features=768, out_features=13, bias=True) (loss_function): CrossEntropyLoss() )" 2023-10-25 15:38:47,009 ---------------------------------------------------------------------------------------------------- 2023-10-25 15:38:47,009 MultiCorpus: 14465 train + 1392 dev + 2432 test sentences - NER_HIPE_2022 Corpus: 14465 train + 1392 dev + 2432 test sentences - /home/ubuntu/.flair/datasets/ner_hipe_2022/v2.1/letemps/fr/with_doc_seperator 2023-10-25 15:38:47,009 ---------------------------------------------------------------------------------------------------- 2023-10-25 15:38:47,009 Train: 14465 sentences 2023-10-25 15:38:47,009 (train_with_dev=False, train_with_test=False) 2023-10-25 15:38:47,009 ---------------------------------------------------------------------------------------------------- 2023-10-25 15:38:47,009 Training Params: 2023-10-25 15:38:47,009 - learning_rate: "3e-05" 2023-10-25 15:38:47,009 - mini_batch_size: "4" 2023-10-25 15:38:47,009 - max_epochs: "10" 2023-10-25 15:38:47,009 - shuffle: "True" 2023-10-25 15:38:47,009 ---------------------------------------------------------------------------------------------------- 2023-10-25 15:38:47,009 Plugins: 2023-10-25 15:38:47,009 - TensorboardLogger 2023-10-25 15:38:47,009 - LinearScheduler | warmup_fraction: '0.1' 2023-10-25 15:38:47,009 ---------------------------------------------------------------------------------------------------- 2023-10-25 15:38:47,009 Final evaluation on model from best epoch (best-model.pt) 2023-10-25 15:38:47,009 - metric: "('micro avg', 'f1-score')" 2023-10-25 15:38:47,009 ---------------------------------------------------------------------------------------------------- 2023-10-25 15:38:47,009 Computation: 2023-10-25 15:38:47,009 - compute on device: cuda:0 2023-10-25 15:38:47,009 - embedding storage: none 2023-10-25 15:38:47,009 ---------------------------------------------------------------------------------------------------- 2023-10-25 15:38:47,009 Model training base path: "hmbench-letemps/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs4-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-4" 2023-10-25 15:38:47,009 ---------------------------------------------------------------------------------------------------- 2023-10-25 15:38:47,009 ---------------------------------------------------------------------------------------------------- 2023-10-25 15:38:47,009 Logging anything other than scalars to TensorBoard is currently not supported. 2023-10-25 15:39:09,553 epoch 1 - iter 361/3617 - loss 1.19886281 - time (sec): 22.54 - samples/sec: 1661.18 - lr: 0.000003 - momentum: 0.000000 2023-10-25 15:39:32,433 epoch 1 - iter 722/3617 - loss 0.68831441 - time (sec): 45.42 - samples/sec: 1682.42 - lr: 0.000006 - momentum: 0.000000 2023-10-25 15:39:54,936 epoch 1 - iter 1083/3617 - loss 0.51170947 - time (sec): 67.93 - samples/sec: 1667.26 - lr: 0.000009 - momentum: 0.000000 2023-10-25 15:40:17,679 epoch 1 - iter 1444/3617 - loss 0.41162390 - time (sec): 90.67 - samples/sec: 1675.46 - lr: 0.000012 - momentum: 0.000000 2023-10-25 15:40:40,401 epoch 1 - iter 1805/3617 - loss 0.35326768 - time (sec): 113.39 - samples/sec: 1674.11 - lr: 0.000015 - momentum: 0.000000 2023-10-25 15:41:03,110 epoch 1 - iter 2166/3617 - loss 0.31329417 - time (sec): 136.10 - samples/sec: 1682.14 - lr: 0.000018 - momentum: 0.000000 2023-10-25 15:41:25,626 epoch 1 - iter 2527/3617 - loss 0.28535492 - time (sec): 158.62 - samples/sec: 1680.30 - lr: 0.000021 - momentum: 0.000000 2023-10-25 15:41:48,351 epoch 1 - iter 2888/3617 - loss 0.26499215 - time (sec): 181.34 - samples/sec: 1682.04 - lr: 0.000024 - momentum: 0.000000 2023-10-25 15:42:10,950 epoch 1 - iter 3249/3617 - loss 0.24775587 - time (sec): 203.94 - samples/sec: 1678.70 - lr: 0.000027 - momentum: 0.000000 2023-10-25 15:42:33,298 epoch 1 - iter 3610/3617 - loss 0.23447570 - time (sec): 226.29 - samples/sec: 1675.45 - lr: 0.000030 - momentum: 0.000000 2023-10-25 15:42:33,748 ---------------------------------------------------------------------------------------------------- 2023-10-25 15:42:33,748 EPOCH 1 done: loss 0.2341 - lr: 0.000030 2023-10-25 15:42:38,729 DEV : loss 0.12141559273004532 - f1-score (micro avg) 0.6425 2023-10-25 15:42:38,752 saving best model 2023-10-25 15:42:39,301 ---------------------------------------------------------------------------------------------------- 2023-10-25 15:43:02,256 epoch 2 - iter 361/3617 - loss 0.10229911 - time (sec): 22.95 - samples/sec: 1700.22 - lr: 0.000030 - momentum: 0.000000 2023-10-25 15:43:24,847 epoch 2 - iter 722/3617 - loss 0.10119859 - time (sec): 45.55 - samples/sec: 1678.69 - lr: 0.000029 - momentum: 0.000000 2023-10-25 15:43:47,602 epoch 2 - iter 1083/3617 - loss 0.10115542 - time (sec): 68.30 - samples/sec: 1671.77 - lr: 0.000029 - momentum: 0.000000 2023-10-25 15:44:10,203 epoch 2 - iter 1444/3617 - loss 0.10048881 - time (sec): 90.90 - samples/sec: 1676.55 - lr: 0.000029 - momentum: 0.000000 2023-10-25 15:44:32,710 epoch 2 - iter 1805/3617 - loss 0.09969364 - time (sec): 113.41 - samples/sec: 1664.78 - lr: 0.000028 - momentum: 0.000000 2023-10-25 15:44:55,757 epoch 2 - iter 2166/3617 - loss 0.09899570 - time (sec): 136.45 - samples/sec: 1680.19 - lr: 0.000028 - momentum: 0.000000 2023-10-25 15:45:18,336 epoch 2 - iter 2527/3617 - loss 0.09812338 - time (sec): 159.03 - samples/sec: 1675.61 - lr: 0.000028 - momentum: 0.000000 2023-10-25 15:45:40,866 epoch 2 - iter 2888/3617 - loss 0.09881509 - time (sec): 181.56 - samples/sec: 1674.82 - lr: 0.000027 - momentum: 0.000000 2023-10-25 15:46:03,446 epoch 2 - iter 3249/3617 - loss 0.09818580 - time (sec): 204.14 - samples/sec: 1678.00 - lr: 0.000027 - momentum: 0.000000 2023-10-25 15:46:26,000 epoch 2 - iter 3610/3617 - loss 0.09912199 - time (sec): 226.70 - samples/sec: 1673.03 - lr: 0.000027 - momentum: 0.000000 2023-10-25 15:46:26,427 ---------------------------------------------------------------------------------------------------- 2023-10-25 15:46:26,427 EPOCH 2 done: loss 0.0991 - lr: 0.000027 2023-10-25 15:46:31,155 DEV : loss 0.10703670233488083 - f1-score (micro avg) 0.5748 2023-10-25 15:46:31,178 ---------------------------------------------------------------------------------------------------- 2023-10-25 15:46:54,123 epoch 3 - iter 361/3617 - loss 0.06451220 - time (sec): 22.94 - samples/sec: 1635.94 - lr: 0.000026 - momentum: 0.000000 2023-10-25 15:47:17,018 epoch 3 - iter 722/3617 - loss 0.07071297 - time (sec): 45.84 - samples/sec: 1661.24 - lr: 0.000026 - momentum: 0.000000 2023-10-25 15:47:39,779 epoch 3 - iter 1083/3617 - loss 0.07369259 - time (sec): 68.60 - samples/sec: 1671.64 - lr: 0.000026 - momentum: 0.000000 2023-10-25 15:48:02,401 epoch 3 - iter 1444/3617 - loss 0.07364717 - time (sec): 91.22 - samples/sec: 1660.50 - lr: 0.000025 - momentum: 0.000000 2023-10-25 15:48:25,138 epoch 3 - iter 1805/3617 - loss 0.07368861 - time (sec): 113.96 - samples/sec: 1661.84 - lr: 0.000025 - momentum: 0.000000 2023-10-25 15:48:47,723 epoch 3 - iter 2166/3617 - loss 0.07186830 - time (sec): 136.54 - samples/sec: 1669.15 - lr: 0.000025 - momentum: 0.000000 2023-10-25 15:49:10,464 epoch 3 - iter 2527/3617 - loss 0.07251865 - time (sec): 159.28 - samples/sec: 1672.41 - lr: 0.000024 - momentum: 0.000000 2023-10-25 15:49:32,837 epoch 3 - iter 2888/3617 - loss 0.07304301 - time (sec): 181.66 - samples/sec: 1667.81 - lr: 0.000024 - momentum: 0.000000 2023-10-25 15:49:55,910 epoch 3 - iter 3249/3617 - loss 0.07314916 - time (sec): 204.73 - samples/sec: 1669.74 - lr: 0.000024 - momentum: 0.000000 2023-10-25 15:50:18,464 epoch 3 - iter 3610/3617 - loss 0.07317717 - time (sec): 227.28 - samples/sec: 1667.96 - lr: 0.000023 - momentum: 0.000000 2023-10-25 15:50:18,929 ---------------------------------------------------------------------------------------------------- 2023-10-25 15:50:18,929 EPOCH 3 done: loss 0.0731 - lr: 0.000023 2023-10-25 15:50:23,703 DEV : loss 0.22103023529052734 - f1-score (micro avg) 0.6461 2023-10-25 15:50:23,726 saving best model 2023-10-25 15:50:24,448 ---------------------------------------------------------------------------------------------------- 2023-10-25 15:50:47,337 epoch 4 - iter 361/3617 - loss 0.04349179 - time (sec): 22.89 - samples/sec: 1687.96 - lr: 0.000023 - momentum: 0.000000 2023-10-25 15:51:09,914 epoch 4 - iter 722/3617 - loss 0.04765068 - time (sec): 45.47 - samples/sec: 1697.83 - lr: 0.000023 - momentum: 0.000000 2023-10-25 15:51:32,736 epoch 4 - iter 1083/3617 - loss 0.04598577 - time (sec): 68.29 - samples/sec: 1696.69 - lr: 0.000022 - momentum: 0.000000 2023-10-25 15:51:55,386 epoch 4 - iter 1444/3617 - loss 0.04854533 - time (sec): 90.94 - samples/sec: 1670.10 - lr: 0.000022 - momentum: 0.000000 2023-10-25 15:52:17,964 epoch 4 - iter 1805/3617 - loss 0.05057207 - time (sec): 113.51 - samples/sec: 1665.68 - lr: 0.000022 - momentum: 0.000000 2023-10-25 15:52:40,951 epoch 4 - iter 2166/3617 - loss 0.05016728 - time (sec): 136.50 - samples/sec: 1678.27 - lr: 0.000021 - momentum: 0.000000 2023-10-25 15:53:03,727 epoch 4 - iter 2527/3617 - loss 0.05059513 - time (sec): 159.28 - samples/sec: 1678.58 - lr: 0.000021 - momentum: 0.000000 2023-10-25 15:53:26,318 epoch 4 - iter 2888/3617 - loss 0.05293486 - time (sec): 181.87 - samples/sec: 1675.87 - lr: 0.000021 - momentum: 0.000000 2023-10-25 15:53:49,399 epoch 4 - iter 3249/3617 - loss 0.05285239 - time (sec): 204.95 - samples/sec: 1669.80 - lr: 0.000020 - momentum: 0.000000 2023-10-25 15:54:11,936 epoch 4 - iter 3610/3617 - loss 0.05260123 - time (sec): 227.49 - samples/sec: 1666.60 - lr: 0.000020 - momentum: 0.000000 2023-10-25 15:54:12,392 ---------------------------------------------------------------------------------------------------- 2023-10-25 15:54:12,392 EPOCH 4 done: loss 0.0525 - lr: 0.000020 2023-10-25 15:54:17,149 DEV : loss 0.24151772260665894 - f1-score (micro avg) 0.6262 2023-10-25 15:54:17,172 ---------------------------------------------------------------------------------------------------- 2023-10-25 15:54:39,932 epoch 5 - iter 361/3617 - loss 0.02889314 - time (sec): 22.76 - samples/sec: 1633.70 - lr: 0.000020 - momentum: 0.000000 2023-10-25 15:55:02,383 epoch 5 - iter 722/3617 - loss 0.02845671 - time (sec): 45.21 - samples/sec: 1640.80 - lr: 0.000019 - momentum: 0.000000 2023-10-25 15:55:25,074 epoch 5 - iter 1083/3617 - loss 0.02905149 - time (sec): 67.90 - samples/sec: 1652.84 - lr: 0.000019 - momentum: 0.000000 2023-10-25 15:55:47,562 epoch 5 - iter 1444/3617 - loss 0.03106635 - time (sec): 90.39 - samples/sec: 1657.09 - lr: 0.000019 - momentum: 0.000000 2023-10-25 15:56:10,156 epoch 5 - iter 1805/3617 - loss 0.03395971 - time (sec): 112.98 - samples/sec: 1662.83 - lr: 0.000018 - momentum: 0.000000 2023-10-25 15:56:32,681 epoch 5 - iter 2166/3617 - loss 0.03439912 - time (sec): 135.51 - samples/sec: 1657.17 - lr: 0.000018 - momentum: 0.000000 2023-10-25 15:56:55,301 epoch 5 - iter 2527/3617 - loss 0.03516551 - time (sec): 158.13 - samples/sec: 1655.70 - lr: 0.000018 - momentum: 0.000000 2023-10-25 15:57:18,303 epoch 5 - iter 2888/3617 - loss 0.03540794 - time (sec): 181.13 - samples/sec: 1670.73 - lr: 0.000017 - momentum: 0.000000 2023-10-25 15:57:40,862 epoch 5 - iter 3249/3617 - loss 0.03667999 - time (sec): 203.69 - samples/sec: 1666.12 - lr: 0.000017 - momentum: 0.000000 2023-10-25 15:58:03,741 epoch 5 - iter 3610/3617 - loss 0.03647650 - time (sec): 226.57 - samples/sec: 1674.18 - lr: 0.000017 - momentum: 0.000000 2023-10-25 15:58:04,146 ---------------------------------------------------------------------------------------------------- 2023-10-25 15:58:04,146 EPOCH 5 done: loss 0.0365 - lr: 0.000017 2023-10-25 15:58:09,429 DEV : loss 0.27911558747291565 - f1-score (micro avg) 0.6411 2023-10-25 15:58:09,452 ---------------------------------------------------------------------------------------------------- 2023-10-25 15:58:32,153 epoch 6 - iter 361/3617 - loss 0.01844611 - time (sec): 22.70 - samples/sec: 1684.49 - lr: 0.000016 - momentum: 0.000000 2023-10-25 15:58:54,997 epoch 6 - iter 722/3617 - loss 0.01909398 - time (sec): 45.54 - samples/sec: 1660.35 - lr: 0.000016 - momentum: 0.000000 2023-10-25 15:59:18,007 epoch 6 - iter 1083/3617 - loss 0.02262065 - time (sec): 68.55 - samples/sec: 1690.87 - lr: 0.000016 - momentum: 0.000000 2023-10-25 15:59:40,435 epoch 6 - iter 1444/3617 - loss 0.02337790 - time (sec): 90.98 - samples/sec: 1680.22 - lr: 0.000015 - momentum: 0.000000 2023-10-25 16:00:03,256 epoch 6 - iter 1805/3617 - loss 0.02294877 - time (sec): 113.80 - samples/sec: 1686.69 - lr: 0.000015 - momentum: 0.000000 2023-10-25 16:00:25,717 epoch 6 - iter 2166/3617 - loss 0.02260980 - time (sec): 136.26 - samples/sec: 1685.27 - lr: 0.000015 - momentum: 0.000000 2023-10-25 16:00:48,516 epoch 6 - iter 2527/3617 - loss 0.02245400 - time (sec): 159.06 - samples/sec: 1682.99 - lr: 0.000014 - momentum: 0.000000 2023-10-25 16:01:11,169 epoch 6 - iter 2888/3617 - loss 0.02342671 - time (sec): 181.72 - samples/sec: 1678.37 - lr: 0.000014 - momentum: 0.000000 2023-10-25 16:01:33,582 epoch 6 - iter 3249/3617 - loss 0.02358711 - time (sec): 204.13 - samples/sec: 1671.21 - lr: 0.000014 - momentum: 0.000000 2023-10-25 16:01:56,285 epoch 6 - iter 3610/3617 - loss 0.02404475 - time (sec): 226.83 - samples/sec: 1671.46 - lr: 0.000013 - momentum: 0.000000 2023-10-25 16:01:56,730 ---------------------------------------------------------------------------------------------------- 2023-10-25 16:01:56,730 EPOCH 6 done: loss 0.0240 - lr: 0.000013 2023-10-25 16:02:02,029 DEV : loss 0.30914661288261414 - f1-score (micro avg) 0.6277 2023-10-25 16:02:02,052 ---------------------------------------------------------------------------------------------------- 2023-10-25 16:02:24,697 epoch 7 - iter 361/3617 - loss 0.01518405 - time (sec): 22.64 - samples/sec: 1682.07 - lr: 0.000013 - momentum: 0.000000 2023-10-25 16:02:47,369 epoch 7 - iter 722/3617 - loss 0.01696119 - time (sec): 45.32 - samples/sec: 1686.25 - lr: 0.000013 - momentum: 0.000000 2023-10-25 16:03:10,138 epoch 7 - iter 1083/3617 - loss 0.01811277 - time (sec): 68.08 - samples/sec: 1679.52 - lr: 0.000012 - momentum: 0.000000 2023-10-25 16:03:33,026 epoch 7 - iter 1444/3617 - loss 0.01753427 - time (sec): 90.97 - samples/sec: 1687.97 - lr: 0.000012 - momentum: 0.000000 2023-10-25 16:03:55,497 epoch 7 - iter 1805/3617 - loss 0.01779560 - time (sec): 113.44 - samples/sec: 1678.37 - lr: 0.000012 - momentum: 0.000000 2023-10-25 16:04:18,296 epoch 7 - iter 2166/3617 - loss 0.01660255 - time (sec): 136.24 - samples/sec: 1684.12 - lr: 0.000011 - momentum: 0.000000 2023-10-25 16:04:40,932 epoch 7 - iter 2527/3617 - loss 0.01698618 - time (sec): 158.88 - samples/sec: 1684.01 - lr: 0.000011 - momentum: 0.000000 2023-10-25 16:05:03,601 epoch 7 - iter 2888/3617 - loss 0.01728988 - time (sec): 181.55 - samples/sec: 1677.49 - lr: 0.000011 - momentum: 0.000000 2023-10-25 16:05:26,391 epoch 7 - iter 3249/3617 - loss 0.01721398 - time (sec): 204.34 - samples/sec: 1672.88 - lr: 0.000010 - momentum: 0.000000 2023-10-25 16:05:48,951 epoch 7 - iter 3610/3617 - loss 0.01693395 - time (sec): 226.90 - samples/sec: 1671.22 - lr: 0.000010 - momentum: 0.000000 2023-10-25 16:05:49,404 ---------------------------------------------------------------------------------------------------- 2023-10-25 16:05:49,404 EPOCH 7 done: loss 0.0169 - lr: 0.000010 2023-10-25 16:05:54,703 DEV : loss 0.35005614161491394 - f1-score (micro avg) 0.6476 2023-10-25 16:05:54,726 saving best model 2023-10-25 16:05:55,436 ---------------------------------------------------------------------------------------------------- 2023-10-25 16:06:18,101 epoch 8 - iter 361/3617 - loss 0.01402887 - time (sec): 22.66 - samples/sec: 1711.01 - lr: 0.000010 - momentum: 0.000000 2023-10-25 16:06:40,840 epoch 8 - iter 722/3617 - loss 0.01326071 - time (sec): 45.40 - samples/sec: 1682.72 - lr: 0.000009 - momentum: 0.000000 2023-10-25 16:07:03,485 epoch 8 - iter 1083/3617 - loss 0.01183986 - time (sec): 68.05 - samples/sec: 1685.05 - lr: 0.000009 - momentum: 0.000000 2023-10-25 16:07:26,172 epoch 8 - iter 1444/3617 - loss 0.01108027 - time (sec): 90.74 - samples/sec: 1677.66 - lr: 0.000009 - momentum: 0.000000 2023-10-25 16:07:48,816 epoch 8 - iter 1805/3617 - loss 0.01123144 - time (sec): 113.38 - samples/sec: 1672.07 - lr: 0.000008 - momentum: 0.000000 2023-10-25 16:08:11,441 epoch 8 - iter 2166/3617 - loss 0.01079378 - time (sec): 136.00 - samples/sec: 1672.45 - lr: 0.000008 - momentum: 0.000000 2023-10-25 16:08:34,072 epoch 8 - iter 2527/3617 - loss 0.01078423 - time (sec): 158.63 - samples/sec: 1670.37 - lr: 0.000008 - momentum: 0.000000 2023-10-25 16:08:56,585 epoch 8 - iter 2888/3617 - loss 0.01037040 - time (sec): 181.15 - samples/sec: 1664.64 - lr: 0.000007 - momentum: 0.000000 2023-10-25 16:09:19,603 epoch 8 - iter 3249/3617 - loss 0.01027725 - time (sec): 204.17 - samples/sec: 1671.72 - lr: 0.000007 - momentum: 0.000000 2023-10-25 16:09:42,381 epoch 8 - iter 3610/3617 - loss 0.01022367 - time (sec): 226.94 - samples/sec: 1671.16 - lr: 0.000007 - momentum: 0.000000 2023-10-25 16:09:42,798 ---------------------------------------------------------------------------------------------------- 2023-10-25 16:09:42,799 EPOCH 8 done: loss 0.0102 - lr: 0.000007 2023-10-25 16:09:47,567 DEV : loss 0.3698480725288391 - f1-score (micro avg) 0.6525 2023-10-25 16:09:47,591 saving best model 2023-10-25 16:09:48,302 ---------------------------------------------------------------------------------------------------- 2023-10-25 16:10:11,572 epoch 9 - iter 361/3617 - loss 0.00763808 - time (sec): 23.27 - samples/sec: 1670.63 - lr: 0.000006 - momentum: 0.000000 2023-10-25 16:10:34,042 epoch 9 - iter 722/3617 - loss 0.00979704 - time (sec): 45.74 - samples/sec: 1646.34 - lr: 0.000006 - momentum: 0.000000 2023-10-25 16:10:56,797 epoch 9 - iter 1083/3617 - loss 0.00817557 - time (sec): 68.49 - samples/sec: 1662.63 - lr: 0.000006 - momentum: 0.000000 2023-10-25 16:11:19,697 epoch 9 - iter 1444/3617 - loss 0.00803821 - time (sec): 91.39 - samples/sec: 1664.55 - lr: 0.000005 - momentum: 0.000000 2023-10-25 16:11:42,445 epoch 9 - iter 1805/3617 - loss 0.00799518 - time (sec): 114.14 - samples/sec: 1674.93 - lr: 0.000005 - momentum: 0.000000 2023-10-25 16:12:04,911 epoch 9 - iter 2166/3617 - loss 0.00724524 - time (sec): 136.61 - samples/sec: 1664.75 - lr: 0.000005 - momentum: 0.000000 2023-10-25 16:12:27,624 epoch 9 - iter 2527/3617 - loss 0.00774410 - time (sec): 159.32 - samples/sec: 1659.46 - lr: 0.000004 - momentum: 0.000000 2023-10-25 16:12:50,435 epoch 9 - iter 2888/3617 - loss 0.00794723 - time (sec): 182.13 - samples/sec: 1665.12 - lr: 0.000004 - momentum: 0.000000 2023-10-25 16:13:13,176 epoch 9 - iter 3249/3617 - loss 0.00789576 - time (sec): 204.87 - samples/sec: 1665.74 - lr: 0.000004 - momentum: 0.000000 2023-10-25 16:13:35,910 epoch 9 - iter 3610/3617 - loss 0.00792795 - time (sec): 227.61 - samples/sec: 1666.40 - lr: 0.000003 - momentum: 0.000000 2023-10-25 16:13:36,335 ---------------------------------------------------------------------------------------------------- 2023-10-25 16:13:36,335 EPOCH 9 done: loss 0.0080 - lr: 0.000003 2023-10-25 16:13:41,094 DEV : loss 0.3735716640949249 - f1-score (micro avg) 0.6539 2023-10-25 16:13:41,117 saving best model 2023-10-25 16:13:41,777 ---------------------------------------------------------------------------------------------------- 2023-10-25 16:14:04,798 epoch 10 - iter 361/3617 - loss 0.00287944 - time (sec): 23.02 - samples/sec: 1742.64 - lr: 0.000003 - momentum: 0.000000 2023-10-25 16:14:27,220 epoch 10 - iter 722/3617 - loss 0.00450426 - time (sec): 45.44 - samples/sec: 1693.31 - lr: 0.000003 - momentum: 0.000000 2023-10-25 16:14:49,716 epoch 10 - iter 1083/3617 - loss 0.00489107 - time (sec): 67.94 - samples/sec: 1682.68 - lr: 0.000002 - momentum: 0.000000 2023-10-25 16:15:12,220 epoch 10 - iter 1444/3617 - loss 0.00465774 - time (sec): 90.44 - samples/sec: 1677.15 - lr: 0.000002 - momentum: 0.000000 2023-10-25 16:15:34,869 epoch 10 - iter 1805/3617 - loss 0.00452385 - time (sec): 113.09 - samples/sec: 1671.53 - lr: 0.000002 - momentum: 0.000000 2023-10-25 16:15:57,676 epoch 10 - iter 2166/3617 - loss 0.00459334 - time (sec): 135.90 - samples/sec: 1678.53 - lr: 0.000001 - momentum: 0.000000 2023-10-25 16:16:20,440 epoch 10 - iter 2527/3617 - loss 0.00459891 - time (sec): 158.66 - samples/sec: 1677.78 - lr: 0.000001 - momentum: 0.000000 2023-10-25 16:16:43,162 epoch 10 - iter 2888/3617 - loss 0.00448095 - time (sec): 181.38 - samples/sec: 1681.58 - lr: 0.000001 - momentum: 0.000000 2023-10-25 16:17:06,066 epoch 10 - iter 3249/3617 - loss 0.00453443 - time (sec): 204.29 - samples/sec: 1671.10 - lr: 0.000000 - momentum: 0.000000 2023-10-25 16:17:28,567 epoch 10 - iter 3610/3617 - loss 0.00449270 - time (sec): 226.79 - samples/sec: 1672.45 - lr: 0.000000 - momentum: 0.000000 2023-10-25 16:17:28,984 ---------------------------------------------------------------------------------------------------- 2023-10-25 16:17:28,985 EPOCH 10 done: loss 0.0045 - lr: 0.000000 2023-10-25 16:17:33,739 DEV : loss 0.4017893970012665 - f1-score (micro avg) 0.6536 2023-10-25 16:17:34,312 ---------------------------------------------------------------------------------------------------- 2023-10-25 16:17:34,312 Loading model from best epoch ... 2023-10-25 16:17:36,078 SequenceTagger predicts: Dictionary with 13 tags: O, S-loc, B-loc, E-loc, I-loc, S-pers, B-pers, E-pers, I-pers, S-org, B-org, E-org, I-org 2023-10-25 16:17:41,702 Results: - F-score (micro) 0.6735 - F-score (macro) 0.5491 - Accuracy 0.5216 By class: precision recall f1-score support loc 0.6490 0.7885 0.7120 591 pers 0.5944 0.7759 0.6731 357 org 0.3721 0.2025 0.2623 79 micro avg 0.6186 0.7390 0.6735 1027 macro avg 0.5385 0.5890 0.5491 1027 weighted avg 0.6087 0.7390 0.6639 1027 2023-10-25 16:17:41,702 ----------------------------------------------------------------------------------------------------