Marvin committed on
Commit
36f2b9a
0 Parent(s):

Initial commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ *.model filter=lfs diff=lfs merge=lfs -text
3
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
4
+ runs/** filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - de
4
+ tags:
5
+ - question-generation
6
+ - german
7
+ - text2text-generation
8
+ - generated_from_trainer
9
+ datasets:
10
+ - lmqg/qg_dequad
11
+ metrics:
12
+ - bleu4
13
+ - f1
14
+ - rouge
15
+ - exact_match
16
+ model-index:
17
+ - name: german-jeopardy-longt5-base-256
18
+ results:
19
+ - task:
20
+ name: Sequence-to-sequence Language Modeling
21
+ type: text2text-generation
22
+ dataset:
23
+ name: lmqg/qg_dequad
24
+ type: default
25
+ args: default
26
+ metrics:
27
+ - name: BLEU-4
28
+ type: bleu4
29
+ value: 10.52
30
+ - name: F1
31
+ type: f1
32
+ value: 33.92
33
+ - name: ROUGE-1
34
+ type: rouge1
35
+ value: 34.80
36
+ - name: ROUGE-2
37
+ type: rouge2
38
+ value: 16.54
39
+ - name: ROUGE-L
40
+ type: rougel
41
+ value: 33.69
42
+ - name: ROUGE-Lsum
43
+ type: rougelsum
44
+ value: 33.70
45
+ - name: Exact Match
46
+ type: exact_match
47
+ value: 1.50
48
+ ---
49
+
50
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
51
+ should probably proofread and complete it, then remove this comment. -->
52
+
53
+ # german-jeopardy-longt5-base-256
54
+
55
+ This model is a fine-tuned version of [google/long-t5-tglobal-base](https://huggingface.co/google/long-t5-tglobal-base) on the [lmqg/qg_dequad](https://huggingface.co/datasets/lmqg/qg_dequad) dataset.
56
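+ It achieves the following results on the test set (ROUGE, Exact Match and F1 are reported as percentages here, whereas the training results table below reports them as fractions on the validation set):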
57
+ - Loss: 1.7833
58
+ - Brevity Penalty: 0.8244
59
+ - System Length: 17427
60
+ - Reference Length: 20793
61
+ - ROUGE-1: 34.80
62
+ - ROUGE-2: 16.54
63
+ - ROUGE-L: 33.69
64
+ - ROUGE-Lsum: 33.70
65
+ - Exact Match: 1.50
66
+ - BLEU: 10.52
67
+ - F1: 33.92
68
+
69
+ ## Model description
70
+
71
+ See [google/long-t5-tglobal-base](https://huggingface.co/google/long-t5-tglobal-base) for more information about the
72
+ model architecture.
73
+ The model was trained on a single NVIDIA RTX 3090 GPU with 24 GB of VRAM.
74
+
75
+ ## Intended uses & limitations
76
+
77
+ This model can be used for question generation on German text.
78
+
79
+ ## Training and evaluation data
80
+
81
+ See [lmqg/qg_dequad](https://huggingface.co/datasets/lmqg/qg_dequad).
82
+
83
+ ## Training procedure
84
+
85
+ ### Training hyperparameters
86
+
87
+ The following hyperparameters were used during training:
88
+ - learning_rate: 0.0001
89
+ - train_batch_size: 8
90
+ - eval_batch_size: 4
91
+ - seed: 7
92
+ - gradient_accumulation_steps: 32
93
+ - total_train_batch_size: 256
94
+ - optimizer: Adafactor
95
+ - lr_scheduler_type: constant
96
+ - num_epochs: 20
97
+
98
+ ### Training results
99
+
100
+ | Training Loss | Epoch | Step | Validation Loss | Counts 1 | Counts 2 | Counts 3 | Counts 4 | Totals 1 | Totals 2 | Totals 3 | Totals 4 | Precisions 1 | Precisions 2 | Precisions 3 | Precisions 4 | Brevity Penalty | System Length | Reference Length | ROUGE-1 | ROUGE-2 | ROUGE-L | ROUGE-Lsum | Exact Match | BLEU | Mean Generated Length | F1 |
101
+ |:-------------:|:-----:|:----:|:---------------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:------------:|:------------:|:------------:|:------------:|:---------------:|:-------------:|:----------------:|:-------:|:-------:|:-------:|:----------:|:-----------:|:-------:|:---------------------:|:------:|
102
+ | 3.6024 | 0.99 | 36 | 2.4682 | 5645 | 1343 | 424 | 109 | 15388 | 13184 | 10980 | 8776 | 36.6844 | 10.1866 | 3.8616 | 1.242 | 0.6832 | 15388 | 21250 | 0.2285 | 0.0824 | 0.2192 | 0.2188 | 0.0005 | 4.4454 | 11.6338 | 0.2236 |
103
+ | 2.9671 | 1.98 | 72 | 2.2445 | 5988 | 1562 | 569 | 179 | 16094 | 13890 | 11686 | 9482 | 37.2064 | 11.2455 | 4.8691 | 1.8878 | 0.7259 | 16094 | 21250 | 0.2465 | 0.0971 | 0.2371 | 0.2371 | 0.0018 | 5.7163 | 12.314 | 0.2401 |
104
+ | 2.6324 | 2.99 | 109 | 2.1227 | 6539 | 1846 | 702 | 240 | 17173 | 14969 | 12765 | 10561 | 38.0772 | 12.3322 | 5.4994 | 2.2725 | 0.7887 | 17173 | 21250 | 0.2729 | 0.1154 | 0.2601 | 0.2604 | 0.0027 | 6.9028 | 13.2319 | 0.2663 |
105
+ | 2.5557 | 3.98 | 145 | 2.0357 | 6491 | 1923 | 752 | 275 | 15961 | 13757 | 11553 | 9349 | 40.6679 | 13.9783 | 6.5091 | 2.9415 | 0.7179 | 15961 | 21250 | 0.2783 | 0.1214 | 0.2676 | 0.2678 | 0.0059 | 7.3331 | 12.0962 | 0.2729 |
106
+ | 2.3785 | 5.0 | 182 | 1.9824 | 6808 | 2113 | 855 | 328 | 16439 | 14235 | 12031 | 9827 | 41.4137 | 14.8437 | 7.1066 | 3.3377 | 0.7463 | 16439 | 21250 | 0.2948 | 0.1326 | 0.2825 | 0.2825 | 0.0064 | 8.2007 | 12.6819 | 0.2892 |
107
+ | 2.3396 | 5.99 | 218 | 1.9449 | 7033 | 2194 | 886 | 364 | 16851 | 14647 | 12443 | 10239 | 41.7364 | 14.9792 | 7.1205 | 3.555 | 0.7702 | 16851 | 21250 | 0.3044 | 0.1373 | 0.292 | 0.2922 | 0.0086 | 8.639 | 13.0254 | 0.3 |
108
+ | 2.2557 | 6.98 | 254 | 1.8938 | 7167 | 2285 | 939 | 389 | 16529 | 14325 | 12121 | 9917 | 43.3602 | 15.9511 | 7.7469 | 3.9226 | 0.7515 | 16529 | 21250 | 0.3166 | 0.1428 | 0.3043 | 0.3046 | 0.0095 | 9.049 | 12.7119 | 0.3119 |
109
+ | 2.1168 | 7.99 | 291 | 1.8575 | 7347 | 2425 | 1021 | 425 | 16860 | 14656 | 12452 | 10248 | 43.5765 | 16.5461 | 8.1995 | 4.1472 | 0.7708 | 16860 | 21250 | 0.3258 | 0.1505 | 0.3137 | 0.3142 | 0.0104 | 9.6447 | 12.9374 | 0.3211 |
110
+ | 2.1105 | 8.98 | 327 | 1.8284 | 7460 | 2461 | 1061 | 449 | 17034 | 14830 | 12626 | 10422 | 43.7948 | 16.5947 | 8.4033 | 4.3082 | 0.7807 | 17034 | 21250 | 0.3317 | 0.1521 | 0.3187 | 0.3191 | 0.0095 | 9.9436 | 13.1828 | 0.3267 |
111
+ | 1.9913 | 10.0 | 364 | 1.8057 | 7547 | 2537 | 1105 | 487 | 17005 | 14801 | 12597 | 10393 | 44.3811 | 17.1407 | 8.7719 | 4.6858 | 0.7791 | 17005 | 21250 | 0.335 | 0.1566 | 0.323 | 0.3233 | 0.0113 | 10.3601 | 13.0358 | 0.3316 |
112
+ | 1.9943 | 10.99 | 400 | 1.7973 | 7629 | 2574 | 1131 | 496 | 16842 | 14638 | 12434 | 10230 | 45.2975 | 17.5844 | 9.096 | 4.8485 | 0.7697 | 16842 | 21250 | 0.343 | 0.1594 | 0.3296 | 0.33 | 0.0113 | 10.5378 | 13.0154 | 0.3385 |
113
+ | 1.941 | 11.98 | 436 | 1.7773 | 7681 | 2606 | 1164 | 528 | 17105 | 14901 | 12697 | 10493 | 44.905 | 17.4888 | 9.1675 | 5.0319 | 0.7848 | 17105 | 21250 | 0.3421 | 0.1607 | 0.3295 | 0.3294 | 0.0132 | 10.8273 | 13.1361 | 0.3385 |
114
+ | 1.8453 | 12.99 | 473 | 1.7595 | 7817 | 2700 | 1224 | 560 | 17324 | 15120 | 12916 | 10712 | 45.1224 | 17.8571 | 9.4766 | 5.2278 | 0.7972 | 17324 | 21250 | 0.3492 | 0.1662 | 0.3367 | 0.3367 | 0.0127 | 11.2687 | 13.5018 | 0.3447 |
115
+ | 1.85 | 13.98 | 509 | 1.7414 | 7792 | 2642 | 1182 | 537 | 17417 | 15213 | 13009 | 10805 | 44.7379 | 17.3667 | 9.086 | 4.9699 | 0.8025 | 17417 | 21250 | 0.3458 | 0.1632 | 0.3322 | 0.3322 | 0.0127 | 10.9825 | 13.5395 | 0.3416 |
116
+ | 1.7588 | 15.0 | 546 | 1.7346 | 7827 | 2702 | 1223 | 569 | 17265 | 15061 | 12857 | 10653 | 45.3345 | 17.9404 | 9.5123 | 5.3412 | 0.7939 | 17265 | 21250 | 0.3487 | 0.1661 | 0.3355 | 0.3354 | 0.015 | 11.3189 | 13.3026 | 0.3446 |
117
+ | 1.7663 | 15.99 | 582 | 1.7191 | 7946 | 2757 | 1245 | 581 | 17431 | 15227 | 13023 | 10819 | 45.5855 | 18.106 | 9.56 | 5.3702 | 0.8032 | 17431 | 21250 | 0.3544 | 0.1695 | 0.3418 | 0.3416 | 0.0154 | 11.5245 | 13.4515 | 0.3501 |
118
+ | 1.7317 | 16.98 | 618 | 1.7133 | 8068 | 2844 | 1325 | 633 | 17752 | 15548 | 13344 | 11140 | 45.4484 | 18.2917 | 9.9296 | 5.6822 | 0.8212 | 17752 | 21250 | 0.3575 | 0.1746 | 0.3445 | 0.3447 | 0.0163 | 12.0845 | 13.77 | 0.3527 |
119
+ | 1.6421 | 17.99 | 655 | 1.7198 | 8003 | 2823 | 1301 | 609 | 17535 | 15331 | 13127 | 10923 | 45.6401 | 18.4137 | 9.9109 | 5.5754 | 0.8091 | 17535 | 21250 | 0.3576 | 0.1737 | 0.3447 | 0.3448 | 0.015 | 11.877 | 13.4669 | 0.353 |
120
+ | 1.6543 | 18.98 | 691 | 1.7151 | 8031 | 2817 | 1294 | 612 | 17803 | 15599 | 13395 | 11191 | 45.1104 | 18.0588 | 9.6603 | 5.4687 | 0.824 | 17803 | 21250 | 0.3567 | 0.1734 | 0.3435 | 0.3431 | 0.015 | 11.8679 | 13.8648 | 0.351 |
121
+ | 1.5702 | 19.78 | 720 | 1.7079 | 7996 | 2850 | 1330 | 639 | 17275 | 15071 | 12867 | 10663 | 46.2865 | 18.9105 | 10.3365 | 5.9927 | 0.7945 | 17275 | 21250 | 0.3618 | 0.1769 | 0.3485 | 0.348 | 0.0168 | 12.1229 | 13.3367 | 0.3569 |
122
+
123
+
124
+ ### Framework versions
125
+
126
+ - Transformers 4.32.1
127
+ - Pytorch 2.1.0
128
+ - Datasets 2.12.0
129
+ - Tokenizers 0.13.3
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<hl>": 32100
3
+ }
all_results.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 19.78,
3
+ "eval_bleu": 12.1229,
4
+ "eval_bp": 0.7945,
5
+ "eval_counts_1": 7996,
6
+ "eval_counts_2": 2850,
7
+ "eval_counts_3": 1330,
8
+ "eval_counts_4": 639,
9
+ "eval_exact_match": 0.0168,
10
+ "eval_f1": 0.3569,
11
+ "eval_gen_len": 13.3367,
12
+ "eval_loss": 1.7079344987869263,
13
+ "eval_precisions_1": 46.2865,
14
+ "eval_precisions_2": 18.9105,
15
+ "eval_precisions_3": 10.3365,
16
+ "eval_precisions_4": 5.9927,
17
+ "eval_ref_len": 21250,
18
+ "eval_rouge1": 0.3618,
19
+ "eval_rouge2": 0.1769,
20
+ "eval_rougeL": 0.3485,
21
+ "eval_rougeLsum": 0.348,
22
+ "eval_runtime": 891.7338,
23
+ "eval_samples": 2204,
24
+ "eval_samples_per_second": 2.472,
25
+ "eval_steps_per_second": 0.618,
26
+ "eval_sys_len": 17275,
27
+ "eval_totals_1": 17275,
28
+ "eval_totals_2": 15071,
29
+ "eval_totals_3": 12867,
30
+ "eval_totals_4": 10663,
31
+ "predict_bleu": 10.2687,
32
+ "predict_bp": 0.8235,
33
+ "predict_counts_1": 7523,
34
+ "predict_counts_2": 2449,
35
+ "predict_counts_3": 1054,
36
+ "predict_counts_4": 463,
37
+ "predict_exact_match": 0.015,
38
+ "predict_f1": 0.3331,
39
+ "predict_gen_len": 13.824,
40
+ "predict_loss": 1.841234803199768,
41
+ "predict_precisions_1": 43.2083,
42
+ "predict_precisions_2": 16.1044,
43
+ "predict_precisions_3": 8.1058,
44
+ "predict_precisions_4": 4.2874,
45
+ "predict_ref_len": 20793,
46
+ "predict_rouge1": 0.3412,
47
+ "predict_rouge2": 0.1622,
48
+ "predict_rougeL": 0.3308,
49
+ "predict_rougeLsum": 0.3307,
50
+ "predict_runtime": 932.2715,
51
+ "predict_samples": 2204,
52
+ "predict_samples_per_second": 2.364,
53
+ "predict_steps_per_second": 0.591,
54
+ "predict_sys_len": 17411,
55
+ "predict_totals_1": 17411,
56
+ "predict_totals_2": 15207,
57
+ "predict_totals_3": 13003,
58
+ "predict_totals_4": 10799,
59
+ "train_loss": 2.1398978657192655,
60
+ "train_runtime": 23260.8504,
61
+ "train_samples": 9314,
62
+ "train_samples_per_second": 8.008,
63
+ "train_steps_per_second": 0.031
64
+ }
config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/long-t5-tglobal-base",
3
+ "architectures": [
4
+ "LongT5ForConditionalGeneration"
5
+ ],
6
+ "d_ff": 2048,
7
+ "d_kv": 64,
8
+ "d_model": 768,
9
+ "decoder_start_token_id": 0,
10
+ "dense_act_fn": "gelu_new",
11
+ "dropout_rate": 0.1,
12
+ "encoder_attention_type": "transient-global",
13
+ "eos_token_id": 1,
14
+ "feed_forward_proj": "gated-gelu",
15
+ "global_block_size": 16,
16
+ "initializer_factor": 1.0,
17
+ "is_encoder_decoder": true,
18
+ "is_gated_act": true,
19
+ "layer_norm_epsilon": 1e-06,
20
+ "length_penalty": 0.0,
21
+ "local_radius": 127,
22
+ "max_length": 64,
23
+ "model_type": "longt5",
24
+ "n_positions": 4096,
25
+ "num_beams": 4,
26
+ "num_decoder_layers": 12,
27
+ "num_heads": 12,
28
+ "num_layers": 12,
29
+ "output_past": true,
30
+ "pad_token_id": 0,
31
+ "relative_attention_max_distance": 128,
32
+ "relative_attention_num_buckets": 32,
33
+ "tie_word_embeddings": false,
34
+ "torch_dtype": "float32",
35
+ "transformers_version": "4.32.1",
36
+ "use_cache": true,
37
+ "vocab_size": 32128
38
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "decoder_start_token_id": 0,
3
+ "eos_token_id": 1,
4
+ "length_penalty": 0.0,
5
+ "max_length": 64,
6
+ "num_beams": 4,
7
+ "pad_token_id": 0,
8
+ "transformers_version": "4.32.1"
9
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80128d7455f50bb423b0dda63a6a116502bbde3ec7ae08f18eae97b67068e707
3
+ size 990386200
special_tokens_map.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<hl>"
4
+ ],
5
+ "eos_token": "</s>",
6
+ "pad_token": "<pad>",
7
+ "unk_token": "<unk>"
8
+ }
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
3
+ size 791656
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2153a86b626afc71e520a97d38dfe6cac812f17acb678834259347d1d74dc757
3
+ size 2422275
tokenizer_config.json ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>",
99
+ "<extra_id_96>",
100
+ "<extra_id_97>",
101
+ "<extra_id_98>",
102
+ "<extra_id_99>"
103
+ ],
104
+ "clean_up_tokenization_spaces": true,
105
+ "eos_token": "</s>",
106
+ "extra_ids": 100,
107
+ "model_max_length": 1000000000000000019884624838656,
108
+ "pad_token": "<pad>",
109
+ "tokenizer_class": "T5Tokenizer",
110
+ "unk_token": "<unk>"
111
+ }
trainer_state.json ADDED
@@ -0,0 +1,768 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 19.776824034334766,
5
+ "eval_steps": 500,
6
+ "global_step": 720,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.99,
13
+ "learning_rate": 0.0001,
14
+ "loss": 3.6024,
15
+ "step": 36
16
+ },
17
+ {
18
+ "epoch": 0.99,
19
+ "eval_bleu": 4.4454,
20
+ "eval_bp": 0.6832,
21
+ "eval_counts_1": 5645,
22
+ "eval_counts_2": 1343,
23
+ "eval_counts_3": 424,
24
+ "eval_counts_4": 109,
25
+ "eval_exact_match": 0.0005,
26
+ "eval_f1": 0.2236,
27
+ "eval_gen_len": 11.6338,
28
+ "eval_loss": 2.468198776245117,
29
+ "eval_precisions_1": 36.6844,
30
+ "eval_precisions_2": 10.1866,
31
+ "eval_precisions_3": 3.8616,
32
+ "eval_precisions_4": 1.242,
33
+ "eval_ref_len": 21250,
34
+ "eval_rouge1": 0.2285,
35
+ "eval_rouge2": 0.0824,
36
+ "eval_rougeL": 0.2192,
37
+ "eval_rougeLsum": 0.2188,
38
+ "eval_runtime": 813.9917,
39
+ "eval_samples_per_second": 2.708,
40
+ "eval_steps_per_second": 0.677,
41
+ "eval_sys_len": 15388,
42
+ "eval_totals_1": 15388,
43
+ "eval_totals_2": 13184,
44
+ "eval_totals_3": 10980,
45
+ "eval_totals_4": 8776,
46
+ "step": 36
47
+ },
48
+ {
49
+ "epoch": 1.98,
50
+ "learning_rate": 0.0001,
51
+ "loss": 2.9671,
52
+ "step": 72
53
+ },
54
+ {
55
+ "epoch": 1.98,
56
+ "eval_bleu": 5.7163,
57
+ "eval_bp": 0.7259,
58
+ "eval_counts_1": 5988,
59
+ "eval_counts_2": 1562,
60
+ "eval_counts_3": 569,
61
+ "eval_counts_4": 179,
62
+ "eval_exact_match": 0.0018,
63
+ "eval_f1": 0.2401,
64
+ "eval_gen_len": 12.314,
65
+ "eval_loss": 2.244511842727661,
66
+ "eval_precisions_1": 37.2064,
67
+ "eval_precisions_2": 11.2455,
68
+ "eval_precisions_3": 4.8691,
69
+ "eval_precisions_4": 1.8878,
70
+ "eval_ref_len": 21250,
71
+ "eval_rouge1": 0.2465,
72
+ "eval_rouge2": 0.0971,
73
+ "eval_rougeL": 0.2371,
74
+ "eval_rougeLsum": 0.2371,
75
+ "eval_runtime": 802.4783,
76
+ "eval_samples_per_second": 2.746,
77
+ "eval_steps_per_second": 0.687,
78
+ "eval_sys_len": 16094,
79
+ "eval_totals_1": 16094,
80
+ "eval_totals_2": 13890,
81
+ "eval_totals_3": 11686,
82
+ "eval_totals_4": 9482,
83
+ "step": 72
84
+ },
85
+ {
86
+ "epoch": 2.99,
87
+ "learning_rate": 0.0001,
88
+ "loss": 2.6324,
89
+ "step": 109
90
+ },
91
+ {
92
+ "epoch": 2.99,
93
+ "eval_bleu": 6.9028,
94
+ "eval_bp": 0.7887,
95
+ "eval_counts_1": 6539,
96
+ "eval_counts_2": 1846,
97
+ "eval_counts_3": 702,
98
+ "eval_counts_4": 240,
99
+ "eval_exact_match": 0.0027,
100
+ "eval_f1": 0.2663,
101
+ "eval_gen_len": 13.2319,
102
+ "eval_loss": 2.122749090194702,
103
+ "eval_precisions_1": 38.0772,
104
+ "eval_precisions_2": 12.3322,
105
+ "eval_precisions_3": 5.4994,
106
+ "eval_precisions_4": 2.2725,
107
+ "eval_ref_len": 21250,
108
+ "eval_rouge1": 0.2729,
109
+ "eval_rouge2": 0.1154,
110
+ "eval_rougeL": 0.2601,
111
+ "eval_rougeLsum": 0.2604,
112
+ "eval_runtime": 822.9261,
113
+ "eval_samples_per_second": 2.678,
114
+ "eval_steps_per_second": 0.67,
115
+ "eval_sys_len": 17173,
116
+ "eval_totals_1": 17173,
117
+ "eval_totals_2": 14969,
118
+ "eval_totals_3": 12765,
119
+ "eval_totals_4": 10561,
120
+ "step": 109
121
+ },
122
+ {
123
+ "epoch": 3.98,
124
+ "learning_rate": 0.0001,
125
+ "loss": 2.5557,
126
+ "step": 145
127
+ },
128
+ {
129
+ "epoch": 3.98,
130
+ "eval_bleu": 7.3331,
131
+ "eval_bp": 0.7179,
132
+ "eval_counts_1": 6491,
133
+ "eval_counts_2": 1923,
134
+ "eval_counts_3": 752,
135
+ "eval_counts_4": 275,
136
+ "eval_exact_match": 0.0059,
137
+ "eval_f1": 0.2729,
138
+ "eval_gen_len": 12.0962,
139
+ "eval_loss": 2.035691022872925,
140
+ "eval_precisions_1": 40.6679,
141
+ "eval_precisions_2": 13.9783,
142
+ "eval_precisions_3": 6.5091,
143
+ "eval_precisions_4": 2.9415,
144
+ "eval_ref_len": 21250,
145
+ "eval_rouge1": 0.2783,
146
+ "eval_rouge2": 0.1214,
147
+ "eval_rougeL": 0.2676,
148
+ "eval_rougeLsum": 0.2678,
149
+ "eval_runtime": 786.7967,
150
+ "eval_samples_per_second": 2.801,
151
+ "eval_steps_per_second": 0.7,
152
+ "eval_sys_len": 15961,
153
+ "eval_totals_1": 15961,
154
+ "eval_totals_2": 13757,
155
+ "eval_totals_3": 11553,
156
+ "eval_totals_4": 9349,
157
+ "step": 145
158
+ },
159
+ {
160
+ "epoch": 5.0,
161
+ "learning_rate": 0.0001,
162
+ "loss": 2.3785,
163
+ "step": 182
164
+ },
165
+ {
166
+ "epoch": 5.0,
167
+ "eval_bleu": 8.2007,
168
+ "eval_bp": 0.7463,
169
+ "eval_counts_1": 6808,
170
+ "eval_counts_2": 2113,
171
+ "eval_counts_3": 855,
172
+ "eval_counts_4": 328,
173
+ "eval_exact_match": 0.0064,
174
+ "eval_f1": 0.2892,
175
+ "eval_gen_len": 12.6819,
176
+ "eval_loss": 1.9824347496032715,
177
+ "eval_precisions_1": 41.4137,
178
+ "eval_precisions_2": 14.8437,
179
+ "eval_precisions_3": 7.1066,
180
+ "eval_precisions_4": 3.3377,
181
+ "eval_ref_len": 21250,
182
+ "eval_rouge1": 0.2948,
183
+ "eval_rouge2": 0.1326,
184
+ "eval_rougeL": 0.2825,
185
+ "eval_rougeLsum": 0.2825,
186
+ "eval_runtime": 806.3535,
187
+ "eval_samples_per_second": 2.733,
188
+ "eval_steps_per_second": 0.683,
189
+ "eval_sys_len": 16439,
190
+ "eval_totals_1": 16439,
191
+ "eval_totals_2": 14235,
192
+ "eval_totals_3": 12031,
193
+ "eval_totals_4": 9827,
194
+ "step": 182
195
+ },
196
+ {
197
+ "epoch": 5.99,
198
+ "learning_rate": 0.0001,
199
+ "loss": 2.3396,
200
+ "step": 218
201
+ },
202
+ {
203
+ "epoch": 5.99,
204
+ "eval_bleu": 8.639,
205
+ "eval_bp": 0.7702,
206
+ "eval_counts_1": 7033,
207
+ "eval_counts_2": 2194,
208
+ "eval_counts_3": 886,
209
+ "eval_counts_4": 364,
210
+ "eval_exact_match": 0.0086,
211
+ "eval_f1": 0.3,
212
+ "eval_gen_len": 13.0254,
213
+ "eval_loss": 1.9448895454406738,
214
+ "eval_precisions_1": 41.7364,
215
+ "eval_precisions_2": 14.9792,
216
+ "eval_precisions_3": 7.1205,
217
+ "eval_precisions_4": 3.555,
218
+ "eval_ref_len": 21250,
219
+ "eval_rouge1": 0.3044,
220
+ "eval_rouge2": 0.1373,
221
+ "eval_rougeL": 0.292,
222
+ "eval_rougeLsum": 0.2922,
223
+ "eval_runtime": 473.2306,
224
+ "eval_samples_per_second": 4.657,
225
+ "eval_steps_per_second": 1.164,
226
+ "eval_sys_len": 16851,
227
+ "eval_totals_1": 16851,
228
+ "eval_totals_2": 14647,
229
+ "eval_totals_3": 12443,
230
+ "eval_totals_4": 10239,
231
+ "step": 218
232
+ },
233
+ {
234
+ "epoch": 6.98,
235
+ "learning_rate": 0.0001,
236
+ "loss": 2.2557,
237
+ "step": 254
238
+ },
239
+ {
240
+ "epoch": 6.98,
241
+ "eval_bleu": 9.049,
242
+ "eval_bp": 0.7515,
243
+ "eval_counts_1": 7167,
244
+ "eval_counts_2": 2285,
245
+ "eval_counts_3": 939,
246
+ "eval_counts_4": 389,
247
+ "eval_exact_match": 0.0095,
248
+ "eval_f1": 0.3119,
249
+ "eval_gen_len": 12.7119,
250
+ "eval_loss": 1.8937886953353882,
251
+ "eval_precisions_1": 43.3602,
252
+ "eval_precisions_2": 15.9511,
253
+ "eval_precisions_3": 7.7469,
254
+ "eval_precisions_4": 3.9226,
255
+ "eval_ref_len": 21250,
256
+ "eval_rouge1": 0.3166,
257
+ "eval_rouge2": 0.1428,
258
+ "eval_rougeL": 0.3043,
259
+ "eval_rougeLsum": 0.3046,
260
+ "eval_runtime": 453.3958,
261
+ "eval_samples_per_second": 4.861,
262
+ "eval_steps_per_second": 1.215,
263
+ "eval_sys_len": 16529,
264
+ "eval_totals_1": 16529,
265
+ "eval_totals_2": 14325,
266
+ "eval_totals_3": 12121,
267
+ "eval_totals_4": 9917,
268
+ "step": 254
269
+ },
270
+ {
271
+ "epoch": 7.99,
272
+ "learning_rate": 0.0001,
273
+ "loss": 2.1168,
274
+ "step": 291
275
+ },
276
+ {
277
+ "epoch": 7.99,
278
+ "eval_bleu": 9.6447,
279
+ "eval_bp": 0.7708,
280
+ "eval_counts_1": 7347,
281
+ "eval_counts_2": 2425,
282
+ "eval_counts_3": 1021,
283
+ "eval_counts_4": 425,
284
+ "eval_exact_match": 0.0104,
285
+ "eval_f1": 0.3211,
286
+ "eval_gen_len": 12.9374,
287
+ "eval_loss": 1.857459306716919,
288
+ "eval_precisions_1": 43.5765,
289
+ "eval_precisions_2": 16.5461,
290
+ "eval_precisions_3": 8.1995,
291
+ "eval_precisions_4": 4.1472,
292
+ "eval_ref_len": 21250,
293
+ "eval_rouge1": 0.3258,
294
+ "eval_rouge2": 0.1505,
295
+ "eval_rougeL": 0.3137,
296
+ "eval_rougeLsum": 0.3142,
297
+ "eval_runtime": 457.8255,
298
+ "eval_samples_per_second": 4.814,
299
+ "eval_steps_per_second": 1.204,
300
+ "eval_sys_len": 16860,
301
+ "eval_totals_1": 16860,
302
+ "eval_totals_2": 14656,
303
+ "eval_totals_3": 12452,
304
+ "eval_totals_4": 10248,
305
+ "step": 291
306
+ },
307
+ {
308
+ "epoch": 8.98,
309
+ "learning_rate": 0.0001,
310
+ "loss": 2.1105,
311
+ "step": 327
312
+ },
313
+ {
314
+ "epoch": 8.98,
315
+ "eval_bleu": 9.9436,
316
+ "eval_bp": 0.7807,
317
+ "eval_counts_1": 7460,
318
+ "eval_counts_2": 2461,
319
+ "eval_counts_3": 1061,
320
+ "eval_counts_4": 449,
321
+ "eval_exact_match": 0.0095,
322
+ "eval_f1": 0.3267,
323
+ "eval_gen_len": 13.1828,
324
+ "eval_loss": 1.8283559083938599,
325
+ "eval_precisions_1": 43.7948,
326
+ "eval_precisions_2": 16.5947,
327
+ "eval_precisions_3": 8.4033,
328
+ "eval_precisions_4": 4.3082,
329
+ "eval_ref_len": 21250,
330
+ "eval_rouge1": 0.3317,
331
+ "eval_rouge2": 0.1521,
332
+ "eval_rougeL": 0.3187,
333
+ "eval_rougeLsum": 0.3191,
334
+ "eval_runtime": 464.6,
335
+ "eval_samples_per_second": 4.744,
336
+ "eval_steps_per_second": 1.186,
337
+ "eval_sys_len": 17034,
338
+ "eval_totals_1": 17034,
339
+ "eval_totals_2": 14830,
340
+ "eval_totals_3": 12626,
341
+ "eval_totals_4": 10422,
342
+ "step": 327
343
+ },
344
+ {
345
+ "epoch": 10.0,
346
+ "learning_rate": 0.0001,
347
+ "loss": 1.9913,
348
+ "step": 364
349
+ },
350
+ {
351
+ "epoch": 10.0,
352
+ "eval_bleu": 10.3601,
353
+ "eval_bp": 0.7791,
354
+ "eval_counts_1": 7547,
355
+ "eval_counts_2": 2537,
356
+ "eval_counts_3": 1105,
357
+ "eval_counts_4": 487,
358
+ "eval_exact_match": 0.0113,
359
+ "eval_f1": 0.3316,
360
+ "eval_gen_len": 13.0358,
361
+ "eval_loss": 1.8056522607803345,
362
+ "eval_precisions_1": 44.3811,
363
+ "eval_precisions_2": 17.1407,
364
+ "eval_precisions_3": 8.7719,
365
+ "eval_precisions_4": 4.6858,
366
+ "eval_ref_len": 21250,
367
+ "eval_rouge1": 0.335,
368
+ "eval_rouge2": 0.1566,
369
+ "eval_rougeL": 0.323,
370
+ "eval_rougeLsum": 0.3233,
371
+ "eval_runtime": 492.674,
372
+ "eval_samples_per_second": 4.474,
373
+ "eval_steps_per_second": 1.118,
374
+ "eval_sys_len": 17005,
375
+ "eval_totals_1": 17005,
376
+ "eval_totals_2": 14801,
377
+ "eval_totals_3": 12597,
378
+ "eval_totals_4": 10393,
379
+ "step": 364
380
+ },
381
+ {
382
+ "epoch": 10.99,
383
+ "learning_rate": 0.0001,
384
+ "loss": 1.9943,
385
+ "step": 400
386
+ },
387
+ {
388
+ "epoch": 10.99,
389
+ "eval_bleu": 10.5378,
390
+ "eval_bp": 0.7697,
391
+ "eval_counts_1": 7629,
392
+ "eval_counts_2": 2574,
393
+ "eval_counts_3": 1131,
394
+ "eval_counts_4": 496,
395
+ "eval_exact_match": 0.0113,
396
+ "eval_f1": 0.3385,
397
+ "eval_gen_len": 13.0154,
398
+ "eval_loss": 1.7973003387451172,
399
+ "eval_precisions_1": 45.2975,
400
+ "eval_precisions_2": 17.5844,
401
+ "eval_precisions_3": 9.096,
402
+ "eval_precisions_4": 4.8485,
403
+ "eval_ref_len": 21250,
404
+ "eval_rouge1": 0.343,
405
+ "eval_rouge2": 0.1594,
406
+ "eval_rougeL": 0.3296,
407
+ "eval_rougeLsum": 0.33,
408
+ "eval_runtime": 454.7448,
409
+ "eval_samples_per_second": 4.847,
410
+ "eval_steps_per_second": 1.212,
411
+ "eval_sys_len": 16842,
412
+ "eval_totals_1": 16842,
413
+ "eval_totals_2": 14638,
414
+ "eval_totals_3": 12434,
415
+ "eval_totals_4": 10230,
416
+ "step": 400
417
+ },
418
+ {
419
+ "epoch": 11.98,
420
+ "learning_rate": 0.0001,
421
+ "loss": 1.941,
422
+ "step": 436
423
+ },
424
+ {
425
+ "epoch": 11.98,
426
+ "eval_bleu": 10.8273,
427
+ "eval_bp": 0.7848,
428
+ "eval_counts_1": 7681,
429
+ "eval_counts_2": 2606,
430
+ "eval_counts_3": 1164,
431
+ "eval_counts_4": 528,
432
+ "eval_exact_match": 0.0132,
433
+ "eval_f1": 0.3385,
434
+ "eval_gen_len": 13.1361,
435
+ "eval_loss": 1.777303695678711,
436
+ "eval_precisions_1": 44.905,
437
+ "eval_precisions_2": 17.4888,
438
+ "eval_precisions_3": 9.1675,
439
+ "eval_precisions_4": 5.0319,
440
+ "eval_ref_len": 21250,
441
+ "eval_rouge1": 0.3421,
442
+ "eval_rouge2": 0.1607,
443
+ "eval_rougeL": 0.3295,
444
+ "eval_rougeLsum": 0.3294,
445
+ "eval_runtime": 458.5033,
446
+ "eval_samples_per_second": 4.807,
447
+ "eval_steps_per_second": 1.202,
448
+ "eval_sys_len": 17105,
449
+ "eval_totals_1": 17105,
450
+ "eval_totals_2": 14901,
451
+ "eval_totals_3": 12697,
452
+ "eval_totals_4": 10493,
453
+ "step": 436
454
+ },
455
+ {
456
+ "epoch": 12.99,
457
+ "learning_rate": 0.0001,
458
+ "loss": 1.8453,
459
+ "step": 473
460
+ },
461
+ {
462
+ "epoch": 12.99,
463
+ "eval_bleu": 11.2687,
464
+ "eval_bp": 0.7972,
465
+ "eval_counts_1": 7817,
466
+ "eval_counts_2": 2700,
467
+ "eval_counts_3": 1224,
468
+ "eval_counts_4": 560,
469
+ "eval_exact_match": 0.0127,
470
+ "eval_f1": 0.3447,
471
+ "eval_gen_len": 13.5018,
472
+ "eval_loss": 1.7595148086547852,
473
+ "eval_precisions_1": 45.1224,
474
+ "eval_precisions_2": 17.8571,
475
+ "eval_precisions_3": 9.4766,
476
+ "eval_precisions_4": 5.2278,
477
+ "eval_ref_len": 21250,
478
+ "eval_rouge1": 0.3492,
479
+ "eval_rouge2": 0.1662,
480
+ "eval_rougeL": 0.3367,
481
+ "eval_rougeLsum": 0.3367,
482
+ "eval_runtime": 465.5444,
483
+ "eval_samples_per_second": 4.734,
484
+ "eval_steps_per_second": 1.184,
485
+ "eval_sys_len": 17324,
486
+ "eval_totals_1": 17324,
487
+ "eval_totals_2": 15120,
488
+ "eval_totals_3": 12916,
489
+ "eval_totals_4": 10712,
490
+ "step": 473
491
+ },
492
+ {
493
+ "epoch": 13.98,
494
+ "learning_rate": 0.0001,
495
+ "loss": 1.85,
496
+ "step": 509
497
+ },
498
+ {
499
+ "epoch": 13.98,
500
+ "eval_bleu": 10.9825,
501
+ "eval_bp": 0.8025,
502
+ "eval_counts_1": 7792,
503
+ "eval_counts_2": 2642,
504
+ "eval_counts_3": 1182,
505
+ "eval_counts_4": 537,
506
+ "eval_exact_match": 0.0127,
507
+ "eval_f1": 0.3416,
508
+ "eval_gen_len": 13.5395,
509
+ "eval_loss": 1.7414402961730957,
510
+ "eval_precisions_1": 44.7379,
511
+ "eval_precisions_2": 17.3667,
512
+ "eval_precisions_3": 9.086,
513
+ "eval_precisions_4": 4.9699,
514
+ "eval_ref_len": 21250,
515
+ "eval_rouge1": 0.3458,
516
+ "eval_rouge2": 0.1632,
517
+ "eval_rougeL": 0.3322,
518
+ "eval_rougeLsum": 0.3322,
519
+ "eval_runtime": 468.8552,
520
+ "eval_samples_per_second": 4.701,
521
+ "eval_steps_per_second": 1.175,
522
+ "eval_sys_len": 17417,
523
+ "eval_totals_1": 17417,
524
+ "eval_totals_2": 15213,
525
+ "eval_totals_3": 13009,
526
+ "eval_totals_4": 10805,
527
+ "step": 509
528
+ },
529
+ {
530
+ "epoch": 15.0,
531
+ "learning_rate": 0.0001,
532
+ "loss": 1.7588,
533
+ "step": 546
534
+ },
535
+ {
536
+ "epoch": 15.0,
537
+ "eval_bleu": 11.3189,
538
+ "eval_bp": 0.7939,
539
+ "eval_counts_1": 7827,
540
+ "eval_counts_2": 2702,
541
+ "eval_counts_3": 1223,
542
+ "eval_counts_4": 569,
543
+ "eval_exact_match": 0.015,
544
+ "eval_f1": 0.3446,
545
+ "eval_gen_len": 13.3026,
546
+ "eval_loss": 1.7346255779266357,
547
+ "eval_precisions_1": 45.3345,
548
+ "eval_precisions_2": 17.9404,
549
+ "eval_precisions_3": 9.5123,
550
+ "eval_precisions_4": 5.3412,
551
+ "eval_ref_len": 21250,
552
+ "eval_rouge1": 0.3487,
553
+ "eval_rouge2": 0.1661,
554
+ "eval_rougeL": 0.3355,
555
+ "eval_rougeLsum": 0.3354,
556
+ "eval_runtime": 464.8491,
557
+ "eval_samples_per_second": 4.741,
558
+ "eval_steps_per_second": 1.185,
559
+ "eval_sys_len": 17265,
560
+ "eval_totals_1": 17265,
561
+ "eval_totals_2": 15061,
562
+ "eval_totals_3": 12857,
563
+ "eval_totals_4": 10653,
564
+ "step": 546
565
+ },
566
+ {
567
+ "epoch": 15.99,
568
+ "learning_rate": 0.0001,
569
+ "loss": 1.7663,
570
+ "step": 582
571
+ },
572
+ {
573
+ "epoch": 15.99,
574
+ "eval_bleu": 11.5245,
575
+ "eval_bp": 0.8032,
576
+ "eval_counts_1": 7946,
577
+ "eval_counts_2": 2757,
578
+ "eval_counts_3": 1245,
579
+ "eval_counts_4": 581,
580
+ "eval_exact_match": 0.0154,
581
+ "eval_f1": 0.3501,
582
+ "eval_gen_len": 13.4515,
583
+ "eval_loss": 1.7190728187561035,
584
+ "eval_precisions_1": 45.5855,
585
+ "eval_precisions_2": 18.106,
586
+ "eval_precisions_3": 9.56,
587
+ "eval_precisions_4": 5.3702,
588
+ "eval_ref_len": 21250,
589
+ "eval_rouge1": 0.3544,
590
+ "eval_rouge2": 0.1695,
591
+ "eval_rougeL": 0.3418,
592
+ "eval_rougeLsum": 0.3416,
593
+ "eval_runtime": 465.8123,
594
+ "eval_samples_per_second": 4.732,
595
+ "eval_steps_per_second": 1.183,
596
+ "eval_sys_len": 17431,
597
+ "eval_totals_1": 17431,
598
+ "eval_totals_2": 15227,
599
+ "eval_totals_3": 13023,
600
+ "eval_totals_4": 10819,
601
+ "step": 582
602
+ },
603
+ {
604
+ "epoch": 16.98,
605
+ "learning_rate": 0.0001,
606
+ "loss": 1.7317,
607
+ "step": 618
608
+ },
609
+ {
610
+ "epoch": 16.98,
611
+ "eval_bleu": 12.0845,
612
+ "eval_bp": 0.8212,
613
+ "eval_counts_1": 8068,
614
+ "eval_counts_2": 2844,
615
+ "eval_counts_3": 1325,
616
+ "eval_counts_4": 633,
617
+ "eval_exact_match": 0.0163,
618
+ "eval_f1": 0.3527,
619
+ "eval_gen_len": 13.77,
620
+ "eval_loss": 1.7133468389511108,
621
+ "eval_precisions_1": 45.4484,
622
+ "eval_precisions_2": 18.2917,
623
+ "eval_precisions_3": 9.9296,
624
+ "eval_precisions_4": 5.6822,
625
+ "eval_ref_len": 21250,
626
+ "eval_rouge1": 0.3575,
627
+ "eval_rouge2": 0.1746,
628
+ "eval_rougeL": 0.3445,
629
+ "eval_rougeLsum": 0.3447,
630
+ "eval_runtime": 458.8154,
631
+ "eval_samples_per_second": 4.804,
632
+ "eval_steps_per_second": 1.201,
633
+ "eval_sys_len": 17752,
634
+ "eval_totals_1": 17752,
635
+ "eval_totals_2": 15548,
636
+ "eval_totals_3": 13344,
637
+ "eval_totals_4": 11140,
638
+ "step": 618
639
+ },
640
+ {
641
+ "epoch": 17.99,
642
+ "learning_rate": 0.0001,
643
+ "loss": 1.6421,
644
+ "step": 655
645
+ },
646
+ {
647
+ "epoch": 17.99,
648
+ "eval_bleu": 11.877,
649
+ "eval_bp": 0.8091,
650
+ "eval_counts_1": 8003,
651
+ "eval_counts_2": 2823,
652
+ "eval_counts_3": 1301,
653
+ "eval_counts_4": 609,
654
+ "eval_exact_match": 0.015,
655
+ "eval_f1": 0.353,
656
+ "eval_gen_len": 13.4669,
657
+ "eval_loss": 1.719835877418518,
658
+ "eval_precisions_1": 45.6401,
659
+ "eval_precisions_2": 18.4137,
660
+ "eval_precisions_3": 9.9109,
661
+ "eval_precisions_4": 5.5754,
662
+ "eval_ref_len": 21250,
663
+ "eval_rouge1": 0.3576,
664
+ "eval_rouge2": 0.1737,
665
+ "eval_rougeL": 0.3447,
666
+ "eval_rougeLsum": 0.3448,
667
+ "eval_runtime": 467.8501,
668
+ "eval_samples_per_second": 4.711,
669
+ "eval_steps_per_second": 1.178,
670
+ "eval_sys_len": 17535,
671
+ "eval_totals_1": 17535,
672
+ "eval_totals_2": 15331,
673
+ "eval_totals_3": 13127,
674
+ "eval_totals_4": 10923,
675
+ "step": 655
676
+ },
677
+ {
678
+ "epoch": 18.98,
679
+ "learning_rate": 0.0001,
680
+ "loss": 1.6543,
681
+ "step": 691
682
+ },
683
+ {
684
+ "epoch": 18.98,
685
+ "eval_bleu": 11.8679,
686
+ "eval_bp": 0.824,
687
+ "eval_counts_1": 8031,
688
+ "eval_counts_2": 2817,
689
+ "eval_counts_3": 1294,
690
+ "eval_counts_4": 612,
691
+ "eval_exact_match": 0.015,
692
+ "eval_f1": 0.351,
693
+ "eval_gen_len": 13.8648,
694
+ "eval_loss": 1.715085506439209,
695
+ "eval_precisions_1": 45.1104,
696
+ "eval_precisions_2": 18.0588,
697
+ "eval_precisions_3": 9.6603,
698
+ "eval_precisions_4": 5.4687,
699
+ "eval_ref_len": 21250,
700
+ "eval_rouge1": 0.3567,
701
+ "eval_rouge2": 0.1734,
702
+ "eval_rougeL": 0.3435,
703
+ "eval_rougeLsum": 0.3431,
704
+ "eval_runtime": 748.2265,
705
+ "eval_samples_per_second": 2.946,
706
+ "eval_steps_per_second": 0.736,
707
+ "eval_sys_len": 17803,
708
+ "eval_totals_1": 17803,
709
+ "eval_totals_2": 15599,
710
+ "eval_totals_3": 13395,
711
+ "eval_totals_4": 11191,
712
+ "step": 691
713
+ },
714
+ {
715
+ "epoch": 19.78,
716
+ "learning_rate": 0.0001,
717
+ "loss": 1.5702,
718
+ "step": 720
719
+ },
720
+ {
721
+ "epoch": 19.78,
722
+ "eval_bleu": 12.1229,
723
+ "eval_bp": 0.7945,
724
+ "eval_counts_1": 7996,
725
+ "eval_counts_2": 2850,
726
+ "eval_counts_3": 1330,
727
+ "eval_counts_4": 639,
728
+ "eval_exact_match": 0.0168,
729
+ "eval_f1": 0.3569,
730
+ "eval_gen_len": 13.3367,
731
+ "eval_loss": 1.7079344987869263,
732
+ "eval_precisions_1": 46.2865,
733
+ "eval_precisions_2": 18.9105,
734
+ "eval_precisions_3": 10.3365,
735
+ "eval_precisions_4": 5.9927,
736
+ "eval_ref_len": 21250,
737
+ "eval_rouge1": 0.3618,
738
+ "eval_rouge2": 0.1769,
739
+ "eval_rougeL": 0.3485,
740
+ "eval_rougeLsum": 0.348,
741
+ "eval_runtime": 880.8231,
742
+ "eval_samples_per_second": 2.502,
743
+ "eval_steps_per_second": 0.626,
744
+ "eval_sys_len": 17275,
745
+ "eval_totals_1": 17275,
746
+ "eval_totals_2": 15071,
747
+ "eval_totals_3": 12867,
748
+ "eval_totals_4": 10663,
749
+ "step": 720
750
+ },
751
+ {
752
+ "epoch": 19.78,
753
+ "step": 720,
754
+ "total_flos": 2.52283256045568e+17,
755
+ "train_loss": 2.1398978657192655,
756
+ "train_runtime": 23260.8504,
757
+ "train_samples_per_second": 8.008,
758
+ "train_steps_per_second": 0.031
759
+ }
760
+ ],
761
+ "logging_steps": 500,
762
+ "max_steps": 720,
763
+ "num_train_epochs": 20,
764
+ "save_steps": 500,
765
+ "total_flos": 2.52283256045568e+17,
766
+ "trial_name": null,
767
+ "trial_params": null
768
+ }
training_args.bin ADDED
Binary file (4.66 kB). View file