2023-10-25 11:49:28,311 ----------------------------------------------------------------------------------------------------
2023-10-25 11:49:28,312 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(64001, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
            (intermediate): BertIntermediate(
              (dense): Linear(in_features=768, out_features=3072, bias=True)
              (intermediate_act_fn): GELUActivation()
            )
            (output): BertOutput(
              (dense): Linear(in_features=3072, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
        )
      )
      (pooler): BertPooler(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (activation): Tanh()
      )
    )
  )
  (locked_dropout): LockedDropout(p=0.5)
  (linear): Linear(in_features=768, out_features=13, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2023-10-25 11:49:28,312 ----------------------------------------------------------------------------------------------------
2023-10-25 11:49:28,312 MultiCorpus: 14465 train + 1392 dev + 2432 test sentences
 - NER_HIPE_2022 Corpus: 14465 train + 1392 dev + 2432 test sentences - /home/ubuntu/.flair/datasets/ner_hipe_2022/v2.1/letemps/fr/with_doc_seperator
2023-10-25 11:49:28,312 ----------------------------------------------------------------------------------------------------
2023-10-25 11:49:28,312 Train: 14465 sentences
2023-10-25 11:49:28,312 (train_with_dev=False, train_with_test=False)
2023-10-25 11:49:28,312 ----------------------------------------------------------------------------------------------------
2023-10-25 11:49:28,312 Training Params:
2023-10-25 11:49:28,312 - learning_rate: "5e-05"
2023-10-25 11:49:28,312 - mini_batch_size: "4"
2023-10-25 11:49:28,312 - max_epochs: "10"
2023-10-25 11:49:28,312 - shuffle: "True"
2023-10-25 11:49:28,312 ----------------------------------------------------------------------------------------------------
2023-10-25 11:49:28,312 Plugins:
2023-10-25 11:49:28,312 - TensorboardLogger
2023-10-25 11:49:28,312 - LinearScheduler | warmup_fraction: '0.1'
2023-10-25 11:49:28,312 ----------------------------------------------------------------------------------------------------
2023-10-25 11:49:28,312 Final evaluation on model from best epoch (best-model.pt)
2023-10-25 11:49:28,312 - metric: "('micro avg', 'f1-score')"
2023-10-25 11:49:28,312 ----------------------------------------------------------------------------------------------------
2023-10-25 11:49:28,312 Computation:
2023-10-25 11:49:28,312 - compute on device: cuda:0
2023-10-25 11:49:28,312 - embedding storage: none
2023-10-25 11:49:28,313 ----------------------------------------------------------------------------------------------------
2023-10-25 11:49:28,313 Model training base path: "hmbench-letemps/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs4-wsFalse-e10-lr5e-05-poolingfirst-layers-1-crfFalse-2"
2023-10-25 11:49:28,313 ----------------------------------------------------------------------------------------------------
2023-10-25 11:49:28,313 ----------------------------------------------------------------------------------------------------
2023-10-25 11:49:28,313 Logging anything other than scalars to TensorBoard is currently not supported.
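Note: the training script itself is not part of this log. A run with the parameters above could be reproduced with a Flair script roughly like the following (a minimal sketch, assuming Flair's NER_HIPE_2022 dataset loader and the dbmdz/bert-base-historic-multilingual-64k-td-cased checkpoint on the Hugging Face Hub; the output directory is hypothetical and the actual hmbench driver script may differ):

```python
from flair.datasets import NER_HIPE_2022
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# HIPE-2022 letemps/fr corpus (14465 train / 1392 dev / 2432 test sentences).
corpus = NER_HIPE_2022(dataset_name="letemps", language="fr")
label_dict = corpus.make_label_dictionary(label_type="ner")

# First-subtoken pooling over the last transformer layer, fine-tuned end to
# end -- matching "poolingfirst-layers-1" in the training base path above.
embeddings = TransformerWordEmbeddings(
    model="dbmdz/bert-base-historic-multilingual-64k-td-cased",
    layers="-1",
    subtoken_pooling="first",
    fine_tune=True,
)

# Plain linear projection with CrossEntropyLoss, no CRF and no RNN
# ("crfFalse" in the base path); hidden_size is unused when use_rnn=False.
tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type="ner",
    use_crf=False,
    use_rnn=False,
)

# fine_tune() attaches a linear lr schedule whose default warmup fraction of
# 0.1 matches the lr ramp visible across epoch 1 in the log below.
trainer = ModelTrainer(tagger, corpus)
trainer.fine_tune(
    "hmbench-letemps-fr",  # hypothetical output directory
    learning_rate=5e-05,
    mini_batch_size=4,
    max_epochs=10,
)
```

The log also shows a TensorboardLogger plugin attached to the trainer; scalar metrics such as loss and lr are logged per epoch.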
2023-10-25 11:49:50,809 epoch 1 - iter 361/3617 - loss 0.86993127 - time (sec): 22.50 - samples/sec: 1710.07 - lr: 0.000005 - momentum: 0.000000
2023-10-25 11:50:13,270 epoch 1 - iter 722/3617 - loss 0.52060995 - time (sec): 44.96 - samples/sec: 1692.19 - lr: 0.000010 - momentum: 0.000000
2023-10-25 11:50:35,928 epoch 1 - iter 1083/3617 - loss 0.39422738 - time (sec): 67.61 - samples/sec: 1687.75 - lr: 0.000015 - momentum: 0.000000
2023-10-25 11:50:58,635 epoch 1 - iter 1444/3617 - loss 0.32755860 - time (sec): 90.32 - samples/sec: 1677.54 - lr: 0.000020 - momentum: 0.000000
2023-10-25 11:51:20,989 epoch 1 - iter 1805/3617 - loss 0.28743077 - time (sec): 112.68 - samples/sec: 1664.91 - lr: 0.000025 - momentum: 0.000000
2023-10-25 11:51:43,571 epoch 1 - iter 2166/3617 - loss 0.25847487 - time (sec): 135.26 - samples/sec: 1665.35 - lr: 0.000030 - momentum: 0.000000
2023-10-25 11:52:06,563 epoch 1 - iter 2527/3617 - loss 0.23733988 - time (sec): 158.25 - samples/sec: 1674.91 - lr: 0.000035 - momentum: 0.000000
2023-10-25 11:52:29,326 epoch 1 - iter 2888/3617 - loss 0.22332063 - time (sec): 181.01 - samples/sec: 1678.95 - lr: 0.000040 - momentum: 0.000000
2023-10-25 11:52:51,919 epoch 1 - iter 3249/3617 - loss 0.21156741 - time (sec): 203.61 - samples/sec: 1675.79 - lr: 0.000045 - momentum: 0.000000
2023-10-25 11:53:14,580 epoch 1 - iter 3610/3617 - loss 0.20214765 - time (sec): 226.27 - samples/sec: 1676.00 - lr: 0.000050 - momentum: 0.000000
2023-10-25 11:53:15,001 ----------------------------------------------------------------------------------------------------
2023-10-25 11:53:15,001 EPOCH 1 done: loss 0.2021 - lr: 0.000050
2023-10-25 11:53:19,501 DEV : loss 0.14469869434833527 - f1-score (micro avg) 0.5759
2023-10-25 11:53:19,522 saving best model
2023-10-25 11:53:20,071 ----------------------------------------------------------------------------------------------------
2023-10-25 11:53:42,599 epoch 2 - iter 361/3617 - loss 0.10828242 - time (sec): 22.53 - samples/sec: 1681.04 - lr: 0.000049 - momentum: 0.000000
2023-10-25 11:54:05,556 epoch 2 - iter 722/3617 - loss 0.11062343 - time (sec): 45.48 - samples/sec: 1692.62 - lr: 0.000049 - momentum: 0.000000
2023-10-25 11:54:28,432 epoch 2 - iter 1083/3617 - loss 0.11343990 - time (sec): 68.36 - samples/sec: 1689.62 - lr: 0.000048 - momentum: 0.000000
2023-10-25 11:54:51,191 epoch 2 - iter 1444/3617 - loss 0.11147450 - time (sec): 91.12 - samples/sec: 1693.49 - lr: 0.000048 - momentum: 0.000000
2023-10-25 11:55:13,719 epoch 2 - iter 1805/3617 - loss 0.11268183 - time (sec): 113.65 - samples/sec: 1686.21 - lr: 0.000047 - momentum: 0.000000
2023-10-25 11:55:36,251 epoch 2 - iter 2166/3617 - loss 0.11108706 - time (sec): 136.18 - samples/sec: 1685.34 - lr: 0.000047 - momentum: 0.000000
2023-10-25 11:55:58,771 epoch 2 - iter 2527/3617 - loss 0.10962742 - time (sec): 158.70 - samples/sec: 1678.37 - lr: 0.000046 - momentum: 0.000000
2023-10-25 11:56:21,330 epoch 2 - iter 2888/3617 - loss 0.10930890 - time (sec): 181.26 - samples/sec: 1677.39 - lr: 0.000046 - momentum: 0.000000
2023-10-25 11:56:43,852 epoch 2 - iter 3249/3617 - loss 0.10745796 - time (sec): 203.78 - samples/sec: 1678.49 - lr: 0.000045 - momentum: 0.000000
2023-10-25 11:57:06,613 epoch 2 - iter 3610/3617 - loss 0.10586077 - time (sec): 226.54 - samples/sec: 1673.92 - lr: 0.000044 - momentum: 0.000000
2023-10-25 11:57:07,071 ----------------------------------------------------------------------------------------------------
2023-10-25 11:57:07,071 EPOCH 2 done: loss 0.1058 - lr: 0.000044
2023-10-25 11:57:12,310 DEV : loss 0.14430101215839386 - f1-score (micro avg) 0.6079
2023-10-25 11:57:12,332 saving best model
2023-10-25 11:57:13,111 ----------------------------------------------------------------------------------------------------
2023-10-25 11:57:35,931 epoch 3 - iter 361/3617 - loss 0.07144153 - time (sec): 22.82 - samples/sec: 1734.60 - lr: 0.000044 - momentum: 0.000000
2023-10-25 11:57:58,787 epoch 3 - iter 722/3617 - loss 0.08074855 - time (sec): 45.68 - samples/sec: 1719.95 - lr: 0.000043 - momentum: 0.000000
2023-10-25 11:58:21,681 epoch 3 - iter 1083/3617 - loss 0.08263674 - time (sec): 68.57 - samples/sec: 1716.05 - lr: 0.000043 - momentum: 0.000000
2023-10-25 11:58:44,087 epoch 3 - iter 1444/3617 - loss 0.08288668 - time (sec): 90.97 - samples/sec: 1695.55 - lr: 0.000042 - momentum: 0.000000
2023-10-25 11:59:06,702 epoch 3 - iter 1805/3617 - loss 0.10363913 - time (sec): 113.59 - samples/sec: 1690.82 - lr: 0.000042 - momentum: 0.000000
2023-10-25 11:59:29,167 epoch 3 - iter 2166/3617 - loss 0.13603267 - time (sec): 136.06 - samples/sec: 1683.24 - lr: 0.000041 - momentum: 0.000000
2023-10-25 11:59:51,558 epoch 3 - iter 2527/3617 - loss 0.15873752 - time (sec): 158.45 - samples/sec: 1675.58 - lr: 0.000041 - momentum: 0.000000
2023-10-25 12:00:14,538 epoch 3 - iter 2888/3617 - loss 0.17453687 - time (sec): 181.43 - samples/sec: 1674.80 - lr: 0.000040 - momentum: 0.000000
2023-10-25 12:00:37,065 epoch 3 - iter 3249/3617 - loss 0.18855528 - time (sec): 203.95 - samples/sec: 1676.48 - lr: 0.000039 - momentum: 0.000000
2023-10-25 12:00:59,529 epoch 3 - iter 3610/3617 - loss 0.20118879 - time (sec): 226.42 - samples/sec: 1675.07 - lr: 0.000039 - momentum: 0.000000
2023-10-25 12:00:59,959 ----------------------------------------------------------------------------------------------------
2023-10-25 12:00:59,960 EPOCH 3 done: loss 0.2013 - lr: 0.000039
2023-10-25 12:01:05,163 DEV : loss 0.2843758761882782 - f1-score (micro avg) 0.0046
2023-10-25 12:01:05,185 ----------------------------------------------------------------------------------------------------
2023-10-25 12:01:27,882 epoch 4 - iter 361/3617 - loss 0.31486142 - time (sec): 22.70 - samples/sec: 1706.40 - lr: 0.000038 - momentum: 0.000000
2023-10-25 12:01:50,468 epoch 4 - iter 722/3617 - loss 0.30266314 - time (sec): 45.28 - samples/sec: 1686.83 - lr: 0.000038 - momentum: 0.000000
2023-10-25 12:02:13,127 epoch 4 - iter 1083/3617 - loss 0.30220104 - time (sec): 67.94 - samples/sec: 1689.02 - lr: 0.000037 - momentum: 0.000000
2023-10-25 12:02:35,756 epoch 4 - iter 1444/3617 - loss 0.29844936 - time (sec): 90.57 - samples/sec: 1670.74 - lr: 0.000037 - momentum: 0.000000
2023-10-25 12:02:58,524 epoch 4 - iter 1805/3617 - loss 0.29550689 - time (sec): 113.34 - samples/sec: 1672.63 - lr: 0.000036 - momentum: 0.000000
2023-10-25 12:03:21,138 epoch 4 - iter 2166/3617 - loss 0.29482135 - time (sec): 135.95 - samples/sec: 1663.58 - lr: 0.000036 - momentum: 0.000000
2023-10-25 12:03:43,665 epoch 4 - iter 2527/3617 - loss 0.29559234 - time (sec): 158.48 - samples/sec: 1663.63 - lr: 0.000035 - momentum: 0.000000
2023-10-25 12:04:06,283 epoch 4 - iter 2888/3617 - loss 0.29737787 - time (sec): 181.10 - samples/sec: 1660.70 - lr: 0.000034 - momentum: 0.000000
2023-10-25 12:04:29,216 epoch 4 - iter 3249/3617 - loss 0.29938044 - time (sec): 204.03 - samples/sec: 1667.11 - lr: 0.000034 - momentum: 0.000000
2023-10-25 12:04:52,016 epoch 4 - iter 3610/3617 - loss 0.29694305 - time (sec): 226.83 - samples/sec: 1672.49 - lr: 0.000033 - momentum: 0.000000
2023-10-25 12:04:52,435 ----------------------------------------------------------------------------------------------------
2023-10-25 12:04:52,436 EPOCH 4 done: loss 0.2968 - lr: 0.000033
2023-10-25 12:04:57,151 DEV : loss 0.2834990918636322 - f1-score (micro avg) 0.0023
2023-10-25 12:04:57,173 ----------------------------------------------------------------------------------------------------
2023-10-25 12:05:20,646 epoch 5 - iter 361/3617 - loss 0.30958497 - time (sec): 23.47 - samples/sec: 1681.65 - lr: 0.000033 - momentum: 0.000000
2023-10-25 12:05:43,246 epoch 5 - iter 722/3617 - loss 0.29814374 - time (sec): 46.07 - samples/sec: 1668.88 - lr: 0.000032 - momentum: 0.000000
2023-10-25 12:06:05,856 epoch 5 - iter 1083/3617 - loss 0.29063581 - time (sec): 68.68 - samples/sec: 1663.40 - lr: 0.000032 - momentum: 0.000000
2023-10-25 12:06:28,402 epoch 5 - iter 1444/3617 - loss 0.28726657 - time (sec): 91.23 - samples/sec: 1668.57 - lr: 0.000031 - momentum: 0.000000
2023-10-25 12:06:50,962 epoch 5 - iter 1805/3617 - loss 0.28881196 - time (sec): 113.79 - samples/sec: 1669.95 - lr: 0.000031 - momentum: 0.000000
2023-10-25 12:07:13,960 epoch 5 - iter 2166/3617 - loss 0.28967773 - time (sec): 136.79 - samples/sec: 1673.87 - lr: 0.000030 - momentum: 0.000000
2023-10-25 12:07:36,621 epoch 5 - iter 2527/3617 - loss 0.29134561 - time (sec): 159.45 - samples/sec: 1668.53 - lr: 0.000029 - momentum: 0.000000
2023-10-25 12:07:59,257 epoch 5 - iter 2888/3617 - loss 0.29414601 - time (sec): 182.08 - samples/sec: 1667.95 - lr: 0.000029 - momentum: 0.000000
2023-10-25 12:08:21,892 epoch 5 - iter 3249/3617 - loss 0.29373568 - time (sec): 204.72 - samples/sec: 1663.60 - lr: 0.000028 - momentum: 0.000000
2023-10-25 12:08:44,658 epoch 5 - iter 3610/3617 - loss 0.29343011 - time (sec): 227.48 - samples/sec: 1667.69 - lr: 0.000028 - momentum: 0.000000
2023-10-25 12:08:45,069 ----------------------------------------------------------------------------------------------------
2023-10-25 12:08:45,069 EPOCH 5 done: loss 0.2934 - lr: 0.000028
2023-10-25 12:08:49,763 DEV : loss 0.27164599299430847 - f1-score (micro avg) 0.0
2023-10-25 12:08:49,785 ----------------------------------------------------------------------------------------------------
2023-10-25 12:09:12,514 epoch 6 - iter 361/3617 - loss 0.31431851 - time (sec): 22.73 - samples/sec: 1671.84 - lr: 0.000027 - momentum: 0.000000
2023-10-25 12:09:35,378 epoch 6 - iter 722/3617 - loss 0.29910863 - time (sec): 45.59 - samples/sec: 1692.45 - lr: 0.000027 - momentum: 0.000000
2023-10-25 12:09:58,193 epoch 6 - iter 1083/3617 - loss 0.29460583 - time (sec): 68.41 - samples/sec: 1692.25 - lr: 0.000026 - momentum: 0.000000
2023-10-25 12:10:20,897 epoch 6 - iter 1444/3617 - loss 0.29799880 - time (sec): 91.11 - samples/sec: 1687.46 - lr: 0.000026 - momentum: 0.000000
2023-10-25 12:10:43,359 epoch 6 - iter 1805/3617 - loss 0.29509303 - time (sec): 113.57 - samples/sec: 1680.84 - lr: 0.000025 - momentum: 0.000000
2023-10-25 12:11:05,929 epoch 6 - iter 2166/3617 - loss 0.29227878 - time (sec): 136.14 - samples/sec: 1674.83 - lr: 0.000024 - momentum: 0.000000
2023-10-25 12:11:28,526 epoch 6 - iter 2527/3617 - loss 0.29220228 - time (sec): 158.74 - samples/sec: 1669.69 - lr: 0.000024 - momentum: 0.000000
2023-10-25 12:11:51,388 epoch 6 - iter 2888/3617 - loss 0.28976457 - time (sec): 181.60 - samples/sec: 1677.10 - lr: 0.000023 - momentum: 0.000000
2023-10-25 12:12:13,993 epoch 6 - iter 3249/3617 - loss 0.28949741 - time (sec): 204.21 - samples/sec: 1674.39 - lr: 0.000023 - momentum: 0.000000
2023-10-25 12:12:36,781 epoch 6 - iter 3610/3617 - loss 0.29047095 - time (sec): 226.99 - samples/sec: 1670.50 - lr: 0.000022 - momentum: 0.000000
2023-10-25 12:12:37,217 ----------------------------------------------------------------------------------------------------
2023-10-25 12:12:37,218 EPOCH 6 done: loss 0.2905 - lr: 0.000022
2023-10-25 12:12:42,446 DEV : loss 0.2741363048553467 - f1-score (micro avg) 0.0
2023-10-25 12:12:42,469 ----------------------------------------------------------------------------------------------------
2023-10-25 12:13:05,031 epoch 7 - iter 361/3617 - loss 0.28801601 - time (sec): 22.56 - samples/sec: 1668.20 - lr: 0.000022 - momentum: 0.000000
2023-10-25 12:13:27,676 epoch 7 - iter 722/3617 - loss 0.28684817 - time (sec): 45.21 - samples/sec: 1642.00 - lr: 0.000021 - momentum: 0.000000
2023-10-25 12:13:50,389 epoch 7 - iter 1083/3617 - loss 0.28911761 - time (sec): 67.92 - samples/sec: 1636.66 - lr: 0.000021 - momentum: 0.000000
2023-10-25 12:14:12,991 epoch 7 - iter 1444/3617 - loss 0.28494759 - time (sec): 90.52 - samples/sec: 1647.46 - lr: 0.000020 - momentum: 0.000000
2023-10-25 12:14:35,764 epoch 7 - iter 1805/3617 - loss 0.28299654 - time (sec): 113.29 - samples/sec: 1651.43 - lr: 0.000019 - momentum: 0.000000
2023-10-25 12:14:58,250 epoch 7 - iter 2166/3617 - loss 0.28544928 - time (sec): 135.78 - samples/sec: 1648.64 - lr: 0.000019 - momentum: 0.000000
2023-10-25 12:15:21,100 epoch 7 - iter 2527/3617 - loss 0.28500039 - time (sec): 158.63 - samples/sec: 1656.30 - lr: 0.000018 - momentum: 0.000000
2023-10-25 12:15:44,065 epoch 7 - iter 2888/3617 - loss 0.28630743 - time (sec): 181.60 - samples/sec: 1662.84 - lr: 0.000018 - momentum: 0.000000
2023-10-25 12:16:06,597 epoch 7 - iter 3249/3617 - loss 0.28966796 - time (sec): 204.13 - samples/sec: 1667.85 - lr: 0.000017 - momentum: 0.000000
2023-10-25 12:16:29,444 epoch 7 - iter 3610/3617 - loss 0.29008210 - time (sec): 226.97 - samples/sec: 1671.33 - lr: 0.000017 - momentum: 0.000000
2023-10-25 12:16:29,851 ----------------------------------------------------------------------------------------------------
2023-10-25 12:16:29,852 EPOCH 7 done: loss 0.2903 - lr: 0.000017
2023-10-25 12:16:35,059 DEV : loss 0.26774144172668457 - f1-score (micro avg) 0.0
2023-10-25 12:16:35,081 ----------------------------------------------------------------------------------------------------
2023-10-25 12:16:57,825 epoch 8 - iter 361/3617 - loss 0.27660386 - time (sec): 22.74 - samples/sec: 1704.22 - lr: 0.000016 - momentum: 0.000000
2023-10-25 12:17:20,392 epoch 8 - iter 722/3617 - loss 0.28079844 - time (sec): 45.31 - samples/sec: 1685.49 - lr: 0.000016 - momentum: 0.000000
2023-10-25 12:17:43,247 epoch 8 - iter 1083/3617 - loss 0.29020034 - time (sec): 68.17 - samples/sec: 1680.45 - lr: 0.000015 - momentum: 0.000000
2023-10-25 12:18:05,766 epoch 8 - iter 1444/3617 - loss 0.29429510 - time (sec): 90.68 - samples/sec: 1672.78 - lr: 0.000014 - momentum: 0.000000
2023-10-25 12:18:28,476 epoch 8 - iter 1805/3617 - loss 0.29349848 - time (sec): 113.39 - samples/sec: 1670.15 - lr: 0.000014 - momentum: 0.000000
2023-10-25 12:18:51,283 epoch 8 - iter 2166/3617 - loss 0.29446113 - time (sec): 136.20 - samples/sec: 1679.67 - lr: 0.000013 - momentum: 0.000000
2023-10-25 12:19:13,886 epoch 8 - iter 2527/3617 - loss 0.29023797 - time (sec): 158.80 - samples/sec: 1679.28 - lr: 0.000013 - momentum: 0.000000
2023-10-25 12:19:36,555 epoch 8 - iter 2888/3617 - loss 0.28871733 - time (sec): 181.47 - samples/sec: 1680.37 - lr: 0.000012 - momentum: 0.000000
2023-10-25 12:19:59,060 epoch 8 - iter 3249/3617 - loss 0.29028619 - time (sec): 203.98 - samples/sec: 1678.18 - lr: 0.000012 - momentum: 0.000000
2023-10-25 12:20:21,609 epoch 8 - iter 3610/3617 - loss 0.28929915 - time (sec): 226.53 - samples/sec: 1673.87 - lr: 0.000011 - momentum: 0.000000
2023-10-25 12:20:22,052 ----------------------------------------------------------------------------------------------------
2023-10-25 12:20:22,052 EPOCH 8 done: loss 0.2893 - lr: 0.000011
2023-10-25 12:20:27,261 DEV : loss 0.2751471996307373 - f1-score (micro avg) 0.0
2023-10-25 12:20:27,283 ----------------------------------------------------------------------------------------------------
2023-10-25 12:20:49,886 epoch 9 - iter 361/3617 - loss 0.30361453 - time (sec): 22.60 - samples/sec: 1645.83 - lr: 0.000011 - momentum: 0.000000
2023-10-25 12:21:12,574 epoch 9 - iter 722/3617 - loss 0.29195695 - time (sec): 45.29 - samples/sec: 1654.32 - lr: 0.000010 - momentum: 0.000000
2023-10-25 12:21:35,215 epoch 9 - iter 1083/3617 - loss 0.29457114 - time (sec): 67.93 - samples/sec: 1652.12 - lr: 0.000009 - momentum: 0.000000
2023-10-25 12:21:58,034 epoch 9 - iter 1444/3617 - loss 0.29407289 - time (sec): 90.75 - samples/sec: 1668.14 - lr: 0.000009 - momentum: 0.000000
2023-10-25 12:22:20,669 epoch 9 - iter 1805/3617 - loss 0.29048332 - time (sec): 113.39 - samples/sec: 1671.64 - lr: 0.000008 - momentum: 0.000000
2023-10-25 12:22:43,706 epoch 9 - iter 2166/3617 - loss 0.28277675 - time (sec): 136.42 - samples/sec: 1676.87 - lr: 0.000008 - momentum: 0.000000
2023-10-25 12:23:06,481 epoch 9 - iter 2527/3617 - loss 0.28719758 - time (sec): 159.20 - samples/sec: 1675.04 - lr: 0.000007 - momentum: 0.000000
2023-10-25 12:23:29,126 epoch 9 - iter 2888/3617 - loss 0.28670629 - time (sec): 181.84 - samples/sec: 1677.11 - lr: 0.000007 - momentum: 0.000000
2023-10-25 12:23:51,359 epoch 9 - iter 3249/3617 - loss 0.28859893 - time (sec): 204.08 - samples/sec: 1670.16 - lr: 0.000006 - momentum: 0.000000
2023-10-25 12:24:14,063 epoch 9 - iter 3610/3617 - loss 0.28826189 - time (sec): 226.78 - samples/sec: 1672.48 - lr: 0.000006 - momentum: 0.000000
2023-10-25 12:24:14,484 ----------------------------------------------------------------------------------------------------
2023-10-25 12:24:14,484 EPOCH 9 done: loss 0.2882 - lr: 0.000006
2023-10-25 12:24:19,699 DEV : loss 0.27169540524482727 - f1-score (micro avg) 0.0
2023-10-25 12:24:19,721 ----------------------------------------------------------------------------------------------------
2023-10-25 12:24:42,190 epoch 10 - iter 361/3617 - loss 0.26792150 - time (sec): 22.47 - samples/sec: 1653.50 - lr: 0.000005 - momentum: 0.000000
2023-10-25 12:25:05,071 epoch 10 - iter 722/3617 - loss 0.27746427 - time (sec): 45.35 - samples/sec: 1662.83 - lr: 0.000004 - momentum: 0.000000
2023-10-25 12:25:28,013 epoch 10 - iter 1083/3617 - loss 0.27477559 - time (sec): 68.29 - samples/sec: 1680.32 - lr: 0.000004 - momentum: 0.000000
2023-10-25 12:25:50,616 epoch 10 - iter 1444/3617 - loss 0.27361481 - time (sec): 90.89 - samples/sec: 1678.02 - lr: 0.000003 - momentum: 0.000000
2023-10-25 12:26:13,325 epoch 10 - iter 1805/3617 - loss 0.27593718 - time (sec): 113.60 - samples/sec: 1684.69 - lr: 0.000003 - momentum: 0.000000
2023-10-25 12:26:35,800 epoch 10 - iter 2166/3617 - loss 0.28020518 - time (sec): 136.08 - samples/sec: 1674.36 - lr: 0.000002 - momentum: 0.000000
2023-10-25 12:26:58,435 epoch 10 - iter 2527/3617 - loss 0.28002646 - time (sec): 158.71 - samples/sec: 1673.74 - lr: 0.000002 - momentum: 0.000000
2023-10-25 12:27:21,055 epoch 10 - iter 2888/3617 - loss 0.27933392 - time (sec): 181.33 - samples/sec: 1673.50 - lr: 0.000001 - momentum: 0.000000
2023-10-25 12:27:43,896 epoch 10 - iter 3249/3617 - loss 0.28251991 - time (sec): 204.17 - samples/sec: 1677.60 - lr: 0.000001 - momentum: 0.000000
2023-10-25 12:28:06,393 epoch 10 - iter 3610/3617 - loss 0.28721607 - time (sec): 226.67 - samples/sec: 1673.50 - lr: 0.000000 - momentum: 0.000000
2023-10-25 12:28:06,815 ----------------------------------------------------------------------------------------------------
2023-10-25 12:28:06,815 EPOCH 10 done: loss 0.2873 - lr: 0.000000
2023-10-25 12:28:11,509 DEV : loss 0.27294182777404785 - f1-score (micro avg) 0.0
2023-10-25 12:28:12,086 ----------------------------------------------------------------------------------------------------
2023-10-25 12:28:12,087 Loading model from best epoch ...
2023-10-25 12:28:13,838 SequenceTagger predicts: Dictionary with 13 tags: O, S-loc, B-loc, E-loc, I-loc, S-pers, B-pers, E-pers, I-pers, S-org, B-org, E-org, I-org
2023-10-25 12:28:20,073 Results:
- F-score (micro) 0.639
- F-score (macro) 0.4364
- Accuracy 0.4803

By class:
              precision    recall  f1-score   support

         loc     0.6601    0.7360    0.6960       591
        pers     0.5473    0.6975    0.6133       357
         org     0.0000    0.0000    0.0000        79

   micro avg     0.6140    0.6660    0.6390      1027
   macro avg     0.4024    0.4778    0.4364      1027
weighted avg     0.5701    0.6660    0.6137      1027

2023-10-25 12:28:20,073 ----------------------------------------------------------------------------------------------------
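For inference, the best-model.pt checkpoint evaluated above can be loaded directly. A minimal usage sketch (the model path and example sentence are hypothetical; get_spans decodes the 13 BIOES tags listed above into loc/pers/org entity spans):

```python
from flair.data import Sentence
from flair.models import SequenceTagger

# Load the checkpoint saved at the best dev epoch (epoch 2 in this run).
tagger = SequenceTagger.load("hmbench-letemps-fr/best-model.pt")  # hypothetical path

# predict() assigns per-token BIOES tags; get_spans() merges them into spans.
sentence = Sentence("Victor Hugo est né à Besançon .")
tagger.predict(sentence)

for span in sentence.get_spans("ner"):
    label = span.get_label("ner")
    print(span.text, label.value, label.score)
```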