2023-10-25 12:56:32,932 ----------------------------------------------------------------------------------------------------
2023-10-25 12:56:32,933 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(64001, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
            (intermediate): BertIntermediate(
              (dense): Linear(in_features=768, out_features=3072, bias=True)
              (intermediate_act_fn): GELUActivation()
            )
            (output): BertOutput(
              (dense): Linear(in_features=3072, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
        )
      )
      (pooler): BertPooler(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (activation): Tanh()
      )
    )
  )
  (locked_dropout): LockedDropout(p=0.5)
  (linear): Linear(in_features=768, out_features=13, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2023-10-25 12:56:32,933 ----------------------------------------------------------------------------------------------------
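The repr above fully determines the network: a 12-layer historic multilingual BERT with a 64k-token vocabulary, followed by locked dropout (p=0.5) and a 13-way linear classifier, with no CRF and no RNN. Below is a minimal sketch of how such a tagger could be assembled in Flair; the checkpoint name and pooling settings are inferred from the run name logged further down (assumptions, not the original training script).

```python
# Hedged reconstruction of the tagger printed above; argument values are
# inferred from the run name (poolingfirst, layers-1, crfFalse), not taken
# from the original training script.
from flair.data import Dictionary
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger

# The 13-tag BIOES label dictionary that is logged at the end of training.
label_dict = Dictionary(add_unk=False)
for tag in ("O", "S-loc", "B-loc", "E-loc", "I-loc",
            "S-pers", "B-pers", "E-pers", "I-pers",
            "S-org", "B-org", "E-org", "I-org"):
    label_dict.add_item(tag)

embeddings = TransformerWordEmbeddings(
    model="dbmdz/bert-base-historic-multilingual-64k-td-cased",  # assumed
    layers="-1",               # last layer only ("layers-1")
    subtoken_pooling="first",  # first-subtoken pooling ("poolingfirst")
    fine_tune=True,
)

tagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type="ner",
    use_crf=False,  # "crfFalse": plain linear + cross-entropy, as in the repr
    use_rnn=False,  # repr shows embeddings -> locked dropout -> linear only
)
```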
2023-10-25 12:56:32,934 MultiCorpus: 14465 train + 1392 dev + 2432 test sentences
- NER_HIPE_2022 Corpus: 14465 train + 1392 dev + 2432 test sentences - /home/ubuntu/.flair/datasets/ner_hipe_2022/v2.1/letemps/fr/with_doc_seperator
2023-10-25 12:56:32,934 ----------------------------------------------------------------------------------------------------
2023-10-25 12:56:32,934 Train: 14465 sentences
2023-10-25 12:56:32,934 (train_with_dev=False, train_with_test=False)
2023-10-25 12:56:32,934 ----------------------------------------------------------------------------------------------------
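The corpus is the HIPE-2022 "letemps" French dataset, which Flair can download and cache itself. A sketch of loading it (the class and argument names follow flair.datasets conventions and should be checked against your Flair version):

```python
# Hedged sketch: load the HIPE-2022 "letemps" French NER corpus via Flair.
from flair.datasets import NER_HIPE_2022

corpus = NER_HIPE_2022(dataset_name="letemps", language="fr")
print(corpus)  # expected: 14465 train + 1392 dev + 2432 test sentences
```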
2023-10-25 12:56:32,934 Training Params:
2023-10-25 12:56:32,934 - learning_rate: "5e-05"
2023-10-25 12:56:32,934 - mini_batch_size: "8"
2023-10-25 12:56:32,934 - max_epochs: "10"
2023-10-25 12:56:32,934 - shuffle: "True"
2023-10-25 12:56:32,934 ----------------------------------------------------------------------------------------------------
2023-10-25 12:56:32,934 Plugins:
2023-10-25 12:56:32,934 - TensorboardLogger
2023-10-25 12:56:32,934 - LinearScheduler | warmup_fraction: '0.1'
2023-10-25 12:56:32,934 ----------------------------------------------------------------------------------------------------
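These settings correspond to a standard Flair fine-tuning run: peak learning rate 5e-5, mini-batches of 8, 10 epochs, shuffling on, and a linear schedule with 10% warmup. A hedged sketch of the corresponding trainer call, reusing `tagger` and `corpus` from the sketches above (parameter spellings may differ across Flair versions):

```python
# Hedged sketch of a trainer call matching the logged hyperparameters.
from flair.trainers import ModelTrainer

trainer = ModelTrainer(tagger, corpus)
trainer.fine_tune(
    "hmbench-letemps/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased"
    "-bs8-wsFalse-e10-lr5e-05-poolingfirst-layers-1-crfFalse-3",
    learning_rate=5e-5,
    mini_batch_size=8,
    max_epochs=10,
    # shuffle=True and a linear scheduler with warmup_fraction=0.1 are the
    # fine_tune() defaults; TensorBoard logging is attached via a plugin.
)
```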
2023-10-25 12:56:32,934 Final evaluation on model from best epoch (best-model.pt)
2023-10-25 12:56:32,934 - metric: "('micro avg', 'f1-score')"
2023-10-25 12:56:32,934 ----------------------------------------------------------------------------------------------------
2023-10-25 12:56:32,934 Computation:
2023-10-25 12:56:32,934 - compute on device: cuda:0
2023-10-25 12:56:32,934 - embedding storage: none
2023-10-25 12:56:32,934 ----------------------------------------------------------------------------------------------------
2023-10-25 12:56:32,934 Model training base path: "hmbench-letemps/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr5e-05-poolingfirst-layers-1-crfFalse-3"
2023-10-25 12:56:32,934 ----------------------------------------------------------------------------------------------------
2023-10-25 12:56:32,934 ----------------------------------------------------------------------------------------------------
2023-10-25 12:56:32,934 Logging anything other than scalars to TensorBoard is currently not supported.
2023-10-25 12:56:48,346 epoch 1 - iter 180/1809 - loss 1.10006084 - time (sec): 15.41 - samples/sec: 2402.62 - lr: 0.000005 - momentum: 0.000000
2023-10-25 12:57:04,474 epoch 1 - iter 360/1809 - loss 0.63052655 - time (sec): 31.54 - samples/sec: 2436.05 - lr: 0.000010 - momentum: 0.000000
2023-10-25 12:57:20,154 epoch 1 - iter 540/1809 - loss 0.47100990 - time (sec): 47.22 - samples/sec: 2427.01 - lr: 0.000015 - momentum: 0.000000
2023-10-25 12:57:35,999 epoch 1 - iter 720/1809 - loss 0.38400724 - time (sec): 63.06 - samples/sec: 2427.43 - lr: 0.000020 - momentum: 0.000000
2023-10-25 12:57:51,832 epoch 1 - iter 900/1809 - loss 0.33257800 - time (sec): 78.90 - samples/sec: 2412.15 - lr: 0.000025 - momentum: 0.000000
2023-10-25 12:58:07,695 epoch 1 - iter 1080/1809 - loss 0.29578277 - time (sec): 94.76 - samples/sec: 2405.28 - lr: 0.000030 - momentum: 0.000000
2023-10-25 12:58:23,282 epoch 1 - iter 1260/1809 - loss 0.26811077 - time (sec): 110.35 - samples/sec: 2407.63 - lr: 0.000035 - momentum: 0.000000
2023-10-25 12:58:39,159 epoch 1 - iter 1440/1809 - loss 0.24867204 - time (sec): 126.22 - samples/sec: 2401.15 - lr: 0.000040 - momentum: 0.000000
2023-10-25 12:58:54,972 epoch 1 - iter 1620/1809 - loss 0.23254968 - time (sec): 142.04 - samples/sec: 2394.45 - lr: 0.000045 - momentum: 0.000000
2023-10-25 12:59:10,963 epoch 1 - iter 1800/1809 - loss 0.21944741 - time (sec): 158.03 - samples/sec: 2393.36 - lr: 0.000050 - momentum: 0.000000
2023-10-25 12:59:11,727 ----------------------------------------------------------------------------------------------------
2023-10-25 12:59:11,727 EPOCH 1 done: loss 0.2190 - lr: 0.000050
2023-10-25 12:59:16,227 DEV : loss 0.0940776988863945 - f1-score (micro avg) 0.5283
2023-10-25 12:59:16,250 saving best model
2023-10-25 12:59:16,804 ----------------------------------------------------------------------------------------------------
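The lr column follows the LinearScheduler declared above: with warmup_fraction 0.1 over 10 × 1809 = 18090 total steps, warmup lasts 1809 steps, so the learning rate ramps linearly to its 5e-5 peak across exactly epoch 1 and then decays linearly to zero. A quick check of that arithmetic against the logged values (my own sketch, assuming the standard warmup-then-linear-decay rule, not part of the log):

```python
# Verify the logged lr values against a warmup-then-linear-decay schedule.
peak_lr = 5e-5
steps_per_epoch = 1809
total_steps = 10 * steps_per_epoch        # 18090
warmup_steps = int(0.1 * total_steps)     # 1809 -> warmup spans epoch 1

def lr_at(step: int) -> float:
    if step < warmup_steps:
        return peak_lr * step / warmup_steps
    # linear decay from the peak down to zero over the remaining steps
    return peak_lr * (total_steps - step) / (total_steps - warmup_steps)

print(f"{lr_at(180):.6f}")   # 0.000005 -> matches epoch 1, iter 180
print(f"{lr_at(1800):.6f}")  # 0.000050 -> matches epoch 1, iter 1800
print(f"{lr_at(3618):.6f}")  # 0.000044 -> matches end of epoch 2
```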
2023-10-25 12:59:32,156 epoch 2 - iter 180/1809 - loss 0.07972278 - time (sec): 15.35 - samples/sec: 2377.73 - lr: 0.000049 - momentum: 0.000000
2023-10-25 12:59:48,323 epoch 2 - iter 360/1809 - loss 0.08152052 - time (sec): 31.52 - samples/sec: 2340.68 - lr: 0.000049 - momentum: 0.000000
2023-10-25 13:00:04,570 epoch 2 - iter 540/1809 - loss 0.08505170 - time (sec): 47.77 - samples/sec: 2366.58 - lr: 0.000048 - momentum: 0.000000
2023-10-25 13:00:20,330 epoch 2 - iter 720/1809 - loss 0.08594619 - time (sec): 63.53 - samples/sec: 2376.02 - lr: 0.000048 - momentum: 0.000000
2023-10-25 13:00:36,207 epoch 2 - iter 900/1809 - loss 0.08806352 - time (sec): 79.40 - samples/sec: 2380.43 - lr: 0.000047 - momentum: 0.000000
2023-10-25 13:00:51,928 epoch 2 - iter 1080/1809 - loss 0.08857237 - time (sec): 95.12 - samples/sec: 2379.41 - lr: 0.000047 - momentum: 0.000000
2023-10-25 13:01:07,495 epoch 2 - iter 1260/1809 - loss 0.08718027 - time (sec): 110.69 - samples/sec: 2386.51 - lr: 0.000046 - momentum: 0.000000
2023-10-25 13:01:23,189 epoch 2 - iter 1440/1809 - loss 0.08666178 - time (sec): 126.38 - samples/sec: 2392.20 - lr: 0.000046 - momentum: 0.000000
2023-10-25 13:01:38,806 epoch 2 - iter 1620/1809 - loss 0.08537196 - time (sec): 142.00 - samples/sec: 2395.91 - lr: 0.000045 - momentum: 0.000000
2023-10-25 13:01:55,003 epoch 2 - iter 1800/1809 - loss 0.08624945 - time (sec): 158.20 - samples/sec: 2390.41 - lr: 0.000044 - momentum: 0.000000
2023-10-25 13:01:55,826 ----------------------------------------------------------------------------------------------------
2023-10-25 13:01:55,826 EPOCH 2 done: loss 0.0862 - lr: 0.000044
2023-10-25 13:02:01,076 DEV : loss 0.13432565331459045 - f1-score (micro avg) 0.6025
2023-10-25 13:02:01,098 saving best model
2023-10-25 13:02:01,754 ----------------------------------------------------------------------------------------------------
2023-10-25 13:02:17,323 epoch 3 - iter 180/1809 - loss 0.05779259 - time (sec): 15.57 - samples/sec: 2359.77 - lr: 0.000044 - momentum: 0.000000
2023-10-25 13:02:33,718 epoch 3 - iter 360/1809 - loss 0.05644142 - time (sec): 31.96 - samples/sec: 2360.99 - lr: 0.000043 - momentum: 0.000000
2023-10-25 13:02:49,700 epoch 3 - iter 540/1809 - loss 0.06025166 - time (sec): 47.94 - samples/sec: 2376.32 - lr: 0.000043 - momentum: 0.000000
2023-10-25 13:03:05,515 epoch 3 - iter 720/1809 - loss 0.05963981 - time (sec): 63.76 - samples/sec: 2398.85 - lr: 0.000042 - momentum: 0.000000
2023-10-25 13:03:21,210 epoch 3 - iter 900/1809 - loss 0.05988365 - time (sec): 79.45 - samples/sec: 2391.83 - lr: 0.000042 - momentum: 0.000000
2023-10-25 13:03:37,246 epoch 3 - iter 1080/1809 - loss 0.06110576 - time (sec): 95.49 - samples/sec: 2395.55 - lr: 0.000041 - momentum: 0.000000
2023-10-25 13:03:53,081 epoch 3 - iter 1260/1809 - loss 0.05959679 - time (sec): 111.33 - samples/sec: 2397.88 - lr: 0.000041 - momentum: 0.000000
2023-10-25 13:04:08,904 epoch 3 - iter 1440/1809 - loss 0.05990670 - time (sec): 127.15 - samples/sec: 2392.99 - lr: 0.000040 - momentum: 0.000000
2023-10-25 13:04:24,144 epoch 3 - iter 1620/1809 - loss 0.05964992 - time (sec): 142.39 - samples/sec: 2380.45 - lr: 0.000039 - momentum: 0.000000
2023-10-25 13:04:40,347 epoch 3 - iter 1800/1809 - loss 0.06114775 - time (sec): 158.59 - samples/sec: 2382.72 - lr: 0.000039 - momentum: 0.000000
2023-10-25 13:04:41,213 ----------------------------------------------------------------------------------------------------
2023-10-25 13:04:41,213 EPOCH 3 done: loss 0.0611 - lr: 0.000039
2023-10-25 13:04:46,486 DEV : loss 0.1459859311580658 - f1-score (micro avg) 0.6574
2023-10-25 13:04:46,509 saving best model
2023-10-25 13:04:47,256 ----------------------------------------------------------------------------------------------------
2023-10-25 13:05:03,087 epoch 4 - iter 180/1809 - loss 0.03569500 - time (sec): 15.83 - samples/sec: 2388.59 - lr: 0.000038 - momentum: 0.000000
2023-10-25 13:05:18,721 epoch 4 - iter 360/1809 - loss 0.03919591 - time (sec): 31.46 - samples/sec: 2389.76 - lr: 0.000038 - momentum: 0.000000
2023-10-25 13:05:34,881 epoch 4 - iter 540/1809 - loss 0.04102332 - time (sec): 47.62 - samples/sec: 2380.28 - lr: 0.000037 - momentum: 0.000000
2023-10-25 13:05:50,661 epoch 4 - iter 720/1809 - loss 0.04009188 - time (sec): 63.40 - samples/sec: 2377.07 - lr: 0.000037 - momentum: 0.000000
2023-10-25 13:06:06,635 epoch 4 - iter 900/1809 - loss 0.04101052 - time (sec): 79.38 - samples/sec: 2383.76 - lr: 0.000036 - momentum: 0.000000
2023-10-25 13:06:22,324 epoch 4 - iter 1080/1809 - loss 0.04250899 - time (sec): 95.07 - samples/sec: 2389.95 - lr: 0.000036 - momentum: 0.000000
2023-10-25 13:06:37,963 epoch 4 - iter 1260/1809 - loss 0.04333909 - time (sec): 110.71 - samples/sec: 2386.49 - lr: 0.000035 - momentum: 0.000000
2023-10-25 13:06:53,854 epoch 4 - iter 1440/1809 - loss 0.04374822 - time (sec): 126.60 - samples/sec: 2380.16 - lr: 0.000034 - momentum: 0.000000
2023-10-25 13:07:09,575 epoch 4 - iter 1620/1809 - loss 0.04432814 - time (sec): 142.32 - samples/sec: 2380.91 - lr: 0.000034 - momentum: 0.000000
2023-10-25 13:07:25,700 epoch 4 - iter 1800/1809 - loss 0.04524228 - time (sec): 158.44 - samples/sec: 2383.85 - lr: 0.000033 - momentum: 0.000000
2023-10-25 13:07:26,573 ----------------------------------------------------------------------------------------------------
2023-10-25 13:07:26,573 EPOCH 4 done: loss 0.0451 - lr: 0.000033
2023-10-25 13:07:31,848 DEV : loss 0.20192305743694305 - f1-score (micro avg) 0.6289
2023-10-25 13:07:31,871 ----------------------------------------------------------------------------------------------------
2023-10-25 13:07:47,452 epoch 5 - iter 180/1809 - loss 0.02734931 - time (sec): 15.58 - samples/sec: 2412.22 - lr: 0.000033 - momentum: 0.000000
2023-10-25 13:08:03,819 epoch 5 - iter 360/1809 - loss 0.02324021 - time (sec): 31.95 - samples/sec: 2392.25 - lr: 0.000032 - momentum: 0.000000
2023-10-25 13:08:19,580 epoch 5 - iter 540/1809 - loss 0.02650141 - time (sec): 47.71 - samples/sec: 2403.80 - lr: 0.000032 - momentum: 0.000000
2023-10-25 13:08:35,226 epoch 5 - iter 720/1809 - loss 0.02704811 - time (sec): 63.35 - samples/sec: 2417.63 - lr: 0.000031 - momentum: 0.000000
2023-10-25 13:08:51,415 epoch 5 - iter 900/1809 - loss 0.02877765 - time (sec): 79.54 - samples/sec: 2406.90 - lr: 0.000031 - momentum: 0.000000
2023-10-25 13:09:07,224 epoch 5 - iter 1080/1809 - loss 0.02961531 - time (sec): 95.35 - samples/sec: 2400.83 - lr: 0.000030 - momentum: 0.000000
2023-10-25 13:09:23,022 epoch 5 - iter 1260/1809 - loss 0.02938927 - time (sec): 111.15 - samples/sec: 2396.43 - lr: 0.000029 - momentum: 0.000000
2023-10-25 13:09:38,603 epoch 5 - iter 1440/1809 - loss 0.02946213 - time (sec): 126.73 - samples/sec: 2401.26 - lr: 0.000029 - momentum: 0.000000
2023-10-25 13:09:54,292 epoch 5 - iter 1620/1809 - loss 0.02947805 - time (sec): 142.42 - samples/sec: 2396.84 - lr: 0.000028 - momentum: 0.000000
2023-10-25 13:10:10,223 epoch 5 - iter 1800/1809 - loss 0.02955292 - time (sec): 158.35 - samples/sec: 2389.40 - lr: 0.000028 - momentum: 0.000000
2023-10-25 13:10:10,953 ----------------------------------------------------------------------------------------------------
2023-10-25 13:10:10,953 EPOCH 5 done: loss 0.0295 - lr: 0.000028
2023-10-25 13:10:15,727 DEV : loss 0.2949555218219757 - f1-score (micro avg) 0.6355
2023-10-25 13:10:15,750 ----------------------------------------------------------------------------------------------------
2023-10-25 13:10:31,928 epoch 6 - iter 180/1809 - loss 0.01655256 - time (sec): 16.18 - samples/sec: 2405.34 - lr: 0.000027 - momentum: 0.000000
2023-10-25 13:10:47,816 epoch 6 - iter 360/1809 - loss 0.01946603 - time (sec): 32.07 - samples/sec: 2373.06 - lr: 0.000027 - momentum: 0.000000
2023-10-25 13:11:03,540 epoch 6 - iter 540/1809 - loss 0.01771531 - time (sec): 47.79 - samples/sec: 2366.90 - lr: 0.000026 - momentum: 0.000000
2023-10-25 13:11:19,762 epoch 6 - iter 720/1809 - loss 0.01794652 - time (sec): 64.01 - samples/sec: 2376.16 - lr: 0.000026 - momentum: 0.000000
2023-10-25 13:11:35,513 epoch 6 - iter 900/1809 - loss 0.01902434 - time (sec): 79.76 - samples/sec: 2373.75 - lr: 0.000025 - momentum: 0.000000
2023-10-25 13:11:51,427 epoch 6 - iter 1080/1809 - loss 0.01867401 - time (sec): 95.68 - samples/sec: 2377.37 - lr: 0.000024 - momentum: 0.000000
2023-10-25 13:12:07,127 epoch 6 - iter 1260/1809 - loss 0.01897470 - time (sec): 111.38 - samples/sec: 2382.57 - lr: 0.000024 - momentum: 0.000000
2023-10-25 13:12:23,192 epoch 6 - iter 1440/1809 - loss 0.01911851 - time (sec): 127.44 - samples/sec: 2384.79 - lr: 0.000023 - momentum: 0.000000
2023-10-25 13:12:39,025 epoch 6 - iter 1620/1809 - loss 0.01999373 - time (sec): 143.27 - samples/sec: 2382.18 - lr: 0.000023 - momentum: 0.000000
2023-10-25 13:12:54,521 epoch 6 - iter 1800/1809 - loss 0.01982448 - time (sec): 158.77 - samples/sec: 2381.69 - lr: 0.000022 - momentum: 0.000000
2023-10-25 13:12:55,269 ----------------------------------------------------------------------------------------------------
2023-10-25 13:12:55,269 EPOCH 6 done: loss 0.0198 - lr: 0.000022
2023-10-25 13:13:00,034 DEV : loss 0.347699373960495 - f1-score (micro avg) 0.6493
2023-10-25 13:13:00,057 ----------------------------------------------------------------------------------------------------
2023-10-25 13:13:15,623 epoch 7 - iter 180/1809 - loss 0.01012341 - time (sec): 15.57 - samples/sec: 2404.35 - lr: 0.000022 - momentum: 0.000000
2023-10-25 13:13:31,454 epoch 7 - iter 360/1809 - loss 0.01305697 - time (sec): 31.40 - samples/sec: 2373.59 - lr: 0.000021 - momentum: 0.000000
2023-10-25 13:13:47,017 epoch 7 - iter 540/1809 - loss 0.01300877 - time (sec): 46.96 - samples/sec: 2377.11 - lr: 0.000021 - momentum: 0.000000
2023-10-25 13:14:02,889 epoch 7 - iter 720/1809 - loss 0.01389528 - time (sec): 62.83 - samples/sec: 2376.06 - lr: 0.000020 - momentum: 0.000000
2023-10-25 13:14:18,735 epoch 7 - iter 900/1809 - loss 0.01408907 - time (sec): 78.68 - samples/sec: 2375.30 - lr: 0.000019 - momentum: 0.000000
2023-10-25 13:14:34,859 epoch 7 - iter 1080/1809 - loss 0.01370295 - time (sec): 94.80 - samples/sec: 2382.48 - lr: 0.000019 - momentum: 0.000000
2023-10-25 13:14:50,754 epoch 7 - iter 1260/1809 - loss 0.01366540 - time (sec): 110.70 - samples/sec: 2392.99 - lr: 0.000018 - momentum: 0.000000
2023-10-25 13:15:06,865 epoch 7 - iter 1440/1809 - loss 0.01400641 - time (sec): 126.81 - samples/sec: 2390.49 - lr: 0.000018 - momentum: 0.000000
2023-10-25 13:15:22,780 epoch 7 - iter 1620/1809 - loss 0.01385209 - time (sec): 142.72 - samples/sec: 2385.21 - lr: 0.000017 - momentum: 0.000000
2023-10-25 13:15:38,684 epoch 7 - iter 1800/1809 - loss 0.01412168 - time (sec): 158.63 - samples/sec: 2383.30 - lr: 0.000017 - momentum: 0.000000
2023-10-25 13:15:39,407 ----------------------------------------------------------------------------------------------------
2023-10-25 13:15:39,408 EPOCH 7 done: loss 0.0141 - lr: 0.000017
2023-10-25 13:15:44,698 DEV : loss 0.35314860939979553 - f1-score (micro avg) 0.6557
2023-10-25 13:15:44,721 ----------------------------------------------------------------------------------------------------
2023-10-25 13:16:00,867 epoch 8 - iter 180/1809 - loss 0.01022784 - time (sec): 16.14 - samples/sec: 2412.56 - lr: 0.000016 - momentum: 0.000000
2023-10-25 13:16:17,042 epoch 8 - iter 360/1809 - loss 0.01058094 - time (sec): 32.32 - samples/sec: 2352.04 - lr: 0.000016 - momentum: 0.000000
2023-10-25 13:16:33,062 epoch 8 - iter 540/1809 - loss 0.01011424 - time (sec): 48.34 - samples/sec: 2361.84 - lr: 0.000015 - momentum: 0.000000
2023-10-25 13:16:49,047 epoch 8 - iter 720/1809 - loss 0.00966851 - time (sec): 64.32 - samples/sec: 2375.94 - lr: 0.000014 - momentum: 0.000000
2023-10-25 13:17:04,894 epoch 8 - iter 900/1809 - loss 0.00933864 - time (sec): 80.17 - samples/sec: 2376.95 - lr: 0.000014 - momentum: 0.000000
2023-10-25 13:17:20,529 epoch 8 - iter 1080/1809 - loss 0.00951050 - time (sec): 95.81 - samples/sec: 2364.76 - lr: 0.000013 - momentum: 0.000000
2023-10-25 13:17:36,142 epoch 8 - iter 1260/1809 - loss 0.00966767 - time (sec): 111.42 - samples/sec: 2370.92 - lr: 0.000013 - momentum: 0.000000
2023-10-25 13:17:52,143 epoch 8 - iter 1440/1809 - loss 0.00962221 - time (sec): 127.42 - samples/sec: 2379.62 - lr: 0.000012 - momentum: 0.000000
2023-10-25 13:18:07,519 epoch 8 - iter 1620/1809 - loss 0.00957778 - time (sec): 142.80 - samples/sec: 2379.27 - lr: 0.000012 - momentum: 0.000000
2023-10-25 13:18:23,395 epoch 8 - iter 1800/1809 - loss 0.00960260 - time (sec): 158.67 - samples/sec: 2382.90 - lr: 0.000011 - momentum: 0.000000
2023-10-25 13:18:24,169 ----------------------------------------------------------------------------------------------------
2023-10-25 13:18:24,169 EPOCH 8 done: loss 0.0096 - lr: 0.000011
2023-10-25 13:18:29,463 DEV : loss 0.4076786935329437 - f1-score (micro avg) 0.6491
2023-10-25 13:18:29,486 ----------------------------------------------------------------------------------------------------
2023-10-25 13:18:44,946 epoch 9 - iter 180/1809 - loss 0.00346298 - time (sec): 15.46 - samples/sec: 2391.49 - lr: 0.000011 - momentum: 0.000000
2023-10-25 13:19:01,070 epoch 9 - iter 360/1809 - loss 0.00386564 - time (sec): 31.58 - samples/sec: 2399.66 - lr: 0.000010 - momentum: 0.000000
2023-10-25 13:19:16,951 epoch 9 - iter 540/1809 - loss 0.00562997 - time (sec): 47.46 - samples/sec: 2400.68 - lr: 0.000009 - momentum: 0.000000
2023-10-25 13:19:32,422 epoch 9 - iter 720/1809 - loss 0.00546924 - time (sec): 62.94 - samples/sec: 2394.18 - lr: 0.000009 - momentum: 0.000000
2023-10-25 13:19:48,301 epoch 9 - iter 900/1809 - loss 0.00581289 - time (sec): 78.81 - samples/sec: 2392.02 - lr: 0.000008 - momentum: 0.000000
2023-10-25 13:20:04,825 epoch 9 - iter 1080/1809 - loss 0.00611858 - time (sec): 95.34 - samples/sec: 2384.94 - lr: 0.000008 - momentum: 0.000000
2023-10-25 13:20:21,183 epoch 9 - iter 1260/1809 - loss 0.00643895 - time (sec): 111.70 - samples/sec: 2378.78 - lr: 0.000007 - momentum: 0.000000
2023-10-25 13:20:37,254 epoch 9 - iter 1440/1809 - loss 0.00648270 - time (sec): 127.77 - samples/sec: 2383.17 - lr: 0.000007 - momentum: 0.000000
2023-10-25 13:20:52,352 epoch 9 - iter 1620/1809 - loss 0.00637081 - time (sec): 142.86 - samples/sec: 2377.49 - lr: 0.000006 - momentum: 0.000000
2023-10-25 13:21:08,288 epoch 9 - iter 1800/1809 - loss 0.00619420 - time (sec): 158.80 - samples/sec: 2382.08 - lr: 0.000006 - momentum: 0.000000
2023-10-25 13:21:09,058 ----------------------------------------------------------------------------------------------------
2023-10-25 13:21:09,058 EPOCH 9 done: loss 0.0062 - lr: 0.000006
2023-10-25 13:21:14,367 DEV : loss 0.4059355556964874 - f1-score (micro avg) 0.6474
2023-10-25 13:21:14,390 ----------------------------------------------------------------------------------------------------
2023-10-25 13:21:30,210 epoch 10 - iter 180/1809 - loss 0.00238887 - time (sec): 15.82 - samples/sec: 2406.56 - lr: 0.000005 - momentum: 0.000000
2023-10-25 13:21:45,960 epoch 10 - iter 360/1809 - loss 0.00251156 - time (sec): 31.57 - samples/sec: 2405.41 - lr: 0.000004 - momentum: 0.000000
2023-10-25 13:22:01,793 epoch 10 - iter 540/1809 - loss 0.00290608 - time (sec): 47.40 - samples/sec: 2395.50 - lr: 0.000004 - momentum: 0.000000
2023-10-25 13:22:17,530 epoch 10 - iter 720/1809 - loss 0.00315655 - time (sec): 63.14 - samples/sec: 2387.64 - lr: 0.000003 - momentum: 0.000000
2023-10-25 13:22:33,117 epoch 10 - iter 900/1809 - loss 0.00334403 - time (sec): 78.73 - samples/sec: 2379.43 - lr: 0.000003 - momentum: 0.000000
2023-10-25 13:22:48,770 epoch 10 - iter 1080/1809 - loss 0.00312517 - time (sec): 94.38 - samples/sec: 2380.87 - lr: 0.000002 - momentum: 0.000000
2023-10-25 13:23:04,901 epoch 10 - iter 1260/1809 - loss 0.00323274 - time (sec): 110.51 - samples/sec: 2383.07 - lr: 0.000002 - momentum: 0.000000
2023-10-25 13:23:21,257 epoch 10 - iter 1440/1809 - loss 0.00317762 - time (sec): 126.87 - samples/sec: 2385.06 - lr: 0.000001 - momentum: 0.000000
2023-10-25 13:23:37,428 epoch 10 - iter 1620/1809 - loss 0.00332409 - time (sec): 143.04 - samples/sec: 2383.07 - lr: 0.000001 - momentum: 0.000000
2023-10-25 13:23:52,987 epoch 10 - iter 1800/1809 - loss 0.00349659 - time (sec): 158.60 - samples/sec: 2384.70 - lr: 0.000000 - momentum: 0.000000
2023-10-25 13:23:53,696 ----------------------------------------------------------------------------------------------------
2023-10-25 13:23:53,697 EPOCH 10 done: loss 0.0035 - lr: 0.000000
2023-10-25 13:23:59,011 DEV : loss 0.4234822392463684 - f1-score (micro avg) 0.6419
2023-10-25 13:23:59,603 ----------------------------------------------------------------------------------------------------
2023-10-25 13:23:59,604 Loading model from best epoch ...
2023-10-25 13:24:01,370 SequenceTagger predicts: Dictionary with 13 tags: O, S-loc, B-loc, E-loc, I-loc, S-pers, B-pers, E-pers, I-pers, S-org, B-org, E-org, I-org
2023-10-25 13:24:07,107
Results:
- F-score (micro) 0.6591
- F-score (macro) 0.4663
- Accuracy 0.5014

By class:
              precision    recall  f1-score   support

         loc     0.6863    0.7479    0.7158       591
        pers     0.5734    0.7115    0.6350       357
         org     0.5000    0.0253    0.0482        79

   micro avg     0.6398    0.6796    0.6591      1027
   macro avg     0.5866    0.4949    0.4663      1027
weighted avg     0.6327    0.6796    0.6364      1027
2023-10-25 13:24:07,107 ----------------------------------------------------------------------------------------------------
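The by-class table is internally consistent: micro F1 = 2 · 0.6398 · 0.6796 / (0.6398 + 0.6796) ≈ 0.6591, and the low macro F1 is driven almost entirely by org recall (0.0253 over 79 gold spans). To use the trained model, load best-model.pt from the base path and tag sentences; a minimal usage sketch (the checkpoint path is the base path plus "best-model.pt", and the example sentence is my own):

```python
# Minimal usage sketch: load the saved best checkpoint and tag a sentence.
from flair.data import Sentence
from flair.models import SequenceTagger

tagger = SequenceTagger.load(
    "hmbench-letemps/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased"
    "-bs8-wsFalse-e10-lr5e-05-poolingfirst-layers-1-crfFalse-3/best-model.pt"
)

sentence = Sentence("Le Temps est un journal publié à Genève.")
tagger.predict(sentence)
for span in sentence.get_spans("ner"):
    print(span.text, span.get_label("ner").value)  # e.g. "Genève loc"
```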