Marvin committed on
Commit
bfe0903
0 Parent(s):

Initial commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ *.model filter=lfs diff=lfs merge=lfs -text
3
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
4
+ runs/** filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - de
4
+ tags:
5
+ - question-generation
6
+ - german
7
+ - text2text-generation
8
+ - generated_from_trainer
9
+ datasets:
10
+ - lmqg/qg_dequad
11
+ metrics:
12
+ - bleu4
13
+ - f1
14
+ - rouge
15
+ - exact_match
16
+ model-index:
17
+ - name: german-jeopardy-mt5-base-256
18
+ results:
19
+ - task:
20
+ name: Sequence-to-sequence Language Modeling
21
+ type: text2text-generation
22
+ dataset:
23
+ name: lmqg/qg_dequad
24
+ type: default
25
+ args: default
26
+ metrics:
27
+ - name: BLEU-4
28
+ type: bleu4
29
+ value: 13.70
30
+ - name: F1
31
+ type: f1
32
+ value: 37.79
33
+ - name: ROUGE-1
34
+ type: rouge1
35
+ value: 38.80
36
+ - name: ROUGE-2
37
+ type: rouge2
38
+ value: 20.27
39
+ - name: ROUGE-L
40
+ type: rougel
41
+ value: 37.34
42
+ - name: ROUGE-Lsum
43
+ type: rougelsum
44
+ value: 37.32
45
+ - name: Exact Match
46
+ type: exact_match
47
+ value: 2.81
48
+ ---
49
+
50
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
51
+ should probably proofread and complete it, then remove this comment. -->
52
+
53
+ # german-jeopardy-mt5-base-256
54
+
55
+ This model is a fine-tuned version of [google/mt5-base](https://huggingface.co/google/mt5-base) on the [lmqg/qg_dequad](https://huggingface.co/datasets/lmqg/qg_dequad) dataset.
56
+ It achieves the following results on the test set:
57
+ - Loss: 1.51
58
+ - Brevity Penalty: 0.8658
59
+ - System Length: 18174
60
+ - Reference Length: 20793
61
+ - ROUGE-1: 38.80
62
+ - ROUGE-2: 20.27
63
+ - ROUGE-L: 37.34
64
+ - ROUGE-Lsum: 37.32
65
+ - Exact Match: 2.81
66
+ - BLEU: 13.70
67
+ - F1: 37.79
68
+
69
+ ## Model description
70
+
71
+ See [google/mt5-base](https://huggingface.co/google/mt5-base) for the model architecture.
72
+ The model was trained on a single NVIDIA RTX 3090 GPU with 24GB of VRAM.
73
+
74
+ ## Intended uses & limitations
75
+
76
+ This model can be used for question generation on German text.
77
+
78
+ ## Training and evaluation data
79
+
80
+ See [lmqg/qg_dequad](https://huggingface.co/datasets/lmqg/qg_dequad).
81
+
82
+ ## Training procedure
83
+
84
+ ### Training hyperparameters
85
+
86
+ The following hyperparameters were used during training:
87
+ - learning_rate: 0.0001
88
+ - train_batch_size: 4
89
+ - eval_batch_size: 4
90
+ - seed: 7
91
+ - gradient_accumulation_steps: 64
92
+ - total_train_batch_size: 256
93
+ - optimizer: Adafactor
94
+ - lr_scheduler_type: constant
95
+ - num_epochs: 20
96
+
97
+ ### Training results
98
+
99
+ | Training Loss | Epoch | Step | Validation Loss | Counts 1 | Counts 2 | Counts 3 | Counts 4 | Totals 1 | Totals 2 | Totals 3 | Totals 4 | Precisions 1 | Precisions 2 | Precisions 3 | Precisions 4 | Brevity Penalty | System Length | Reference Length | ROUGE-1 | ROUGE-2 | ROUGE-L | ROUGE-Lsum | Exact Match | BLEU | Mean Generated Length | F1 |
100
+ |:-------------:|:-----:|:----:|:---------------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:------------:|:------------:|:------------:|:------------:|:---------------:|:-------------:|:----------------:|:-------:|:-------:|:-------:|:----------:|:-----------:|:-------:|:---------------------:|:------:|
101
+ | 8.9608 | 0.99 | 36 | 2.8883 | 2306 | 50 | 12 | 2 | 17876 | 15672 | 13468 | 11264 | 12.9 | 0.319 | 0.0891 | 0.0178 | 0.828 | 17876 | 21250 | 0.0081 | 0.0022 | 0.0078 | 0.0078 | 0.0 | 0.2352 | 3.1969 | 0.0092 |
102
+ | 3.2364 | 1.98 | 72 | 1.9242 | 6125 | 1727 | 687 | 277 | 21152 | 18948 | 16744 | 14540 | 28.9571 | 9.1144 | 4.103 | 1.9051 | 0.9954 | 21152 | 21250 | 0.2457 | 0.1026 | 0.2345 | 0.2346 | 0.0018 | 6.7083 | 11.8072 | 0.2514 |
103
+ | 2.4963 | 3.0 | 109 | 1.6558 | 6903 | 2271 | 975 | 409 | 16537 | 14333 | 12129 | 9925 | 41.7428 | 15.8446 | 8.0386 | 4.1209 | 0.752 | 16537 | 21250 | 0.2966 | 0.1415 | 0.2854 | 0.2852 | 0.01 | 9.1493 | 12.176 | 0.2909 |
104
+ | 2.2314 | 3.98 | 145 | 1.5771 | 7160 | 2440 | 1098 | 501 | 16627 | 14423 | 12219 | 10015 | 43.0625 | 16.9174 | 8.986 | 5.0025 | 0.7573 | 16627 | 21250 | 0.314 | 0.1535 | 0.3028 | 0.3028 | 0.0136 | 10.187 | 12.157 | 0.3069 |
105
+ | 2.0578 | 4.97 | 181 | 1.5347 | 7447 | 2625 | 1214 | 566 | 17305 | 15101 | 12897 | 10693 | 43.0338 | 17.383 | 9.413 | 5.2932 | 0.7961 | 17305 | 21250 | 0.3286 | 0.1628 | 0.3146 | 0.3146 | 0.0163 | 11.0621 | 12.5585 | 0.32 |
106
+ | 1.8928 | 5.99 | 218 | 1.5128 | 7396 | 2659 | 1257 | 611 | 16598 | 14394 | 12190 | 9986 | 44.5596 | 18.473 | 10.3117 | 6.1186 | 0.7556 | 16598 | 21250 | 0.3326 | 0.1684 | 0.3198 | 0.3198 | 0.0177 | 11.4063 | 12.1692 | 0.3234 |
107
+ | 1.8573 | 6.98 | 254 | 1.4736 | 7531 | 2758 | 1313 | 641 | 16728 | 14524 | 12320 | 10116 | 45.0203 | 18.9893 | 10.6575 | 6.3365 | 0.7631 | 16728 | 21250 | 0.3349 | 0.1717 | 0.3216 | 0.3216 | 0.0163 | 11.8292 | 12.3035 | 0.327 |
108
+ | 1.7361 | 8.0 | 291 | 1.4544 | 7658 | 2849 | 1368 | 668 | 16928 | 14724 | 12520 | 10316 | 45.2387 | 19.3494 | 10.9265 | 6.4754 | 0.7747 | 16928 | 21250 | 0.3414 | 0.1762 | 0.3283 | 0.3284 | 0.0181 | 12.2208 | 12.4628 | 0.3334 |
109
+ | 1.7162 | 8.99 | 327 | 1.4459 | 7703 | 2891 | 1390 | 694 | 16795 | 14591 | 12387 | 10183 | 45.8648 | 19.8136 | 11.2214 | 6.8153 | 0.767 | 16795 | 21250 | 0.3454 | 0.1785 | 0.3325 | 0.3323 | 0.0159 | 12.4536 | 12.4174 | 0.3374 |
110
+ | 1.6589 | 9.98 | 363 | 1.4383 | 7889 | 2983 | 1449 | 719 | 17376 | 15172 | 12968 | 10764 | 45.4017 | 19.6612 | 11.1737 | 6.6797 | 0.8002 | 17376 | 21250 | 0.3519 | 0.1816 | 0.3375 | 0.3372 | 0.0172 | 12.8553 | 12.7101 | 0.3435 |
111
+ | 1.5571 | 10.99 | 400 | 1.4214 | 7889 | 2994 | 1457 | 736 | 17185 | 14981 | 12777 | 10573 | 45.9063 | 19.9853 | 11.4033 | 6.9611 | 0.7894 | 17185 | 21250 | 0.3529 | 0.1845 | 0.3392 | 0.3393 | 0.02 | 12.9671 | 12.6466 | 0.3457 |
112
+ | 1.5502 | 11.98 | 436 | 1.4135 | 7930 | 3008 | 1477 | 741 | 16868 | 14664 | 12460 | 10256 | 47.0121 | 20.5128 | 11.8539 | 7.225 | 0.7712 | 16868 | 21250 | 0.3619 | 0.189 | 0.3492 | 0.3491 | 0.0213 | 13.0741 | 12.4483 | 0.3541 |
113
+ | 1.4564 | 13.0 | 473 | 1.3943 | 8268 | 3200 | 1616 | 837 | 17929 | 15725 | 13521 | 11317 | 46.1152 | 20.3498 | 11.9518 | 7.396 | 0.8309 | 17929 | 21250 | 0.3729 | 0.1974 | 0.3578 | 0.3576 | 0.0218 | 14.1014 | 13.2441 | 0.3647 |
114
+ | 1.4522 | 13.99 | 509 | 1.3953 | 8047 | 3130 | 1564 | 811 | 16789 | 14585 | 12381 | 10177 | 47.9302 | 21.4604 | 12.6323 | 7.9689 | 0.7667 | 16789 | 21250 | 0.3712 | 0.197 | 0.3582 | 0.3581 | 0.0227 | 13.7526 | 12.515 | 0.3627 |
115
+ | 1.407 | 14.98 | 545 | 1.3759 | 8498 | 3358 | 1703 | 877 | 17923 | 15719 | 13515 | 11311 | 47.4139 | 21.3627 | 12.6008 | 7.7535 | 0.8306 | 17923 | 21250 | 0.3856 | 0.2063 | 0.3709 | 0.3706 | 0.0213 | 14.7315 | 13.2849 | 0.3772 |
116
+ | 1.3294 | 15.99 | 582 | 1.3776 | 8481 | 3407 | 1721 | 883 | 17451 | 15247 | 13043 | 10839 | 48.5989 | 22.3454 | 13.1948 | 8.1465 | 0.8044 | 17451 | 21250 | 0.3907 | 0.211 | 0.3766 | 0.3766 | 0.024 | 14.868 | 12.9142 | 0.3822 |
117
+ | 1.3294 | 16.98 | 618 | 1.3803 | 8633 | 3464 | 1767 | 923 | 18004 | 15800 | 13596 | 11392 | 47.9505 | 21.9241 | 12.9965 | 8.1022 | 0.835 | 18004 | 21250 | 0.3946 | 0.2133 | 0.3801 | 0.3798 | 0.0263 | 15.2312 | 13.3103 | 0.3868 |
118
+ | 1.2605 | 18.0 | 655 | 1.3710 | 8560 | 3376 | 1695 | 880 | 17830 | 15626 | 13422 | 11218 | 48.009 | 21.605 | 12.6285 | 7.8445 | 0.8255 | 17830 | 21250 | 0.3922 | 0.2092 | 0.3778 | 0.3775 | 0.0231 | 14.779 | 13.1665 | 0.3846 |
119
+ | 1.2667 | 18.99 | 691 | 1.3694 | 8664 | 3455 | 1733 | 882 | 17834 | 15630 | 13426 | 11222 | 48.5814 | 22.1049 | 12.9078 | 7.8596 | 0.8257 | 17834 | 21250 | 0.3987 | 0.2138 | 0.3853 | 0.3851 | 0.0227 | 15.0008 | 13.2232 | 0.3906 |
120
+ | 1.2074 | 19.79 | 720 | 1.3658 | 8770 | 3465 | 1737 | 880 | 18039 | 15835 | 13631 | 11427 | 48.6169 | 21.8819 | 12.743 | 7.7011 | 0.8369 | 18039 | 21250 | 0.4025 | 0.215 | 0.3883 | 0.3879 | 0.0227 | 15.0442 | 13.4424 | 0.3941 |
121
+
122
+
123
+ ### Framework versions
124
+
125
+ - Transformers 4.32.1
126
+ - PyTorch 2.1.0
127
+ - Datasets 2.12.0
128
+ - Tokenizers 0.13.3
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<hl>": 250100
3
+ }
all_results.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 19.79,
3
+ "eval_bleu": 15.0442,
4
+ "eval_bp": 0.8369,
5
+ "eval_counts_1": 8770,
6
+ "eval_counts_2": 3465,
7
+ "eval_counts_3": 1737,
8
+ "eval_counts_4": 880,
9
+ "eval_exact_match": 0.0227,
10
+ "eval_f1": 0.3941,
11
+ "eval_gen_len": 13.4424,
12
+ "eval_loss": 1.365785837173462,
13
+ "eval_precisions_1": 48.6169,
14
+ "eval_precisions_2": 21.8819,
15
+ "eval_precisions_3": 12.743,
16
+ "eval_precisions_4": 7.7011,
17
+ "eval_ref_len": 21250,
18
+ "eval_rouge1": 0.4025,
19
+ "eval_rouge2": 0.215,
20
+ "eval_rougeL": 0.3883,
21
+ "eval_rougeLsum": 0.3879,
22
+ "eval_runtime": 459.1599,
23
+ "eval_samples": 2204,
24
+ "eval_samples_per_second": 4.8,
25
+ "eval_steps_per_second": 1.2,
26
+ "eval_sys_len": 18039,
27
+ "eval_totals_1": 18039,
28
+ "eval_totals_2": 15835,
29
+ "eval_totals_3": 13631,
30
+ "eval_totals_4": 11427,
31
+ "predict_bleu": 13.4665,
32
+ "predict_bp": 0.8682,
33
+ "predict_counts_1": 8267,
34
+ "predict_counts_2": 3066,
35
+ "predict_counts_3": 1475,
36
+ "predict_counts_4": 724,
37
+ "predict_exact_match": 0.0281,
38
+ "predict_f1": 0.37,
39
+ "predict_gen_len": 13.7772,
40
+ "predict_loss": 1.5614243745803833,
41
+ "predict_precisions_1": 45.3757,
42
+ "predict_precisions_2": 19.1446,
43
+ "predict_precisions_3": 10.6799,
44
+ "predict_precisions_4": 6.2376,
45
+ "predict_ref_len": 20793,
46
+ "predict_rouge1": 0.3804,
47
+ "predict_rouge2": 0.1994,
48
+ "predict_rougeL": 0.3664,
49
+ "predict_rougeLsum": 0.3665,
50
+ "predict_runtime": 461.6889,
51
+ "predict_samples": 2204,
52
+ "predict_samples_per_second": 4.774,
53
+ "predict_steps_per_second": 1.193,
54
+ "predict_sys_len": 18219,
55
+ "predict_totals_1": 18219,
56
+ "predict_totals_2": 16015,
57
+ "predict_totals_3": 13811,
58
+ "predict_totals_4": 11607,
59
+ "train_loss": 2.0875697082943385,
60
+ "train_runtime": 23544.6757,
61
+ "train_samples": 9314,
62
+ "train_samples_per_second": 7.912,
63
+ "train_steps_per_second": 0.031
64
+ }
config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/mt5-base",
3
+ "architectures": [
4
+ "MT5ForConditionalGeneration"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 2048,
8
+ "d_kv": 64,
9
+ "d_model": 768,
10
+ "decoder_start_token_id": 0,
11
+ "dense_act_fn": "gelu_new",
12
+ "dropout_rate": 0.1,
13
+ "eos_token_id": 1,
14
+ "feed_forward_proj": "gated-gelu",
15
+ "initializer_factor": 1.0,
16
+ "is_encoder_decoder": true,
17
+ "is_gated_act": true,
18
+ "layer_norm_epsilon": 1e-06,
19
+ "length_penalty": 0.0,
20
+ "max_length": 64,
21
+ "model_type": "mt5",
22
+ "num_beams": 4,
23
+ "num_decoder_layers": 12,
24
+ "num_heads": 12,
25
+ "num_layers": 12,
26
+ "output_past": true,
27
+ "pad_token_id": 0,
28
+ "relative_attention_max_distance": 128,
29
+ "relative_attention_num_buckets": 32,
30
+ "tie_word_embeddings": false,
31
+ "tokenizer_class": "T5Tokenizer",
32
+ "torch_dtype": "float32",
33
+ "transformers_version": "4.32.1",
34
+ "use_cache": true,
35
+ "vocab_size": 250112
36
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "decoder_start_token_id": 0,
3
+ "eos_token_id": 1,
4
+ "length_penalty": 0.0,
5
+ "max_length": 64,
6
+ "num_beams": 4,
7
+ "pad_token_id": 0,
8
+ "transformers_version": "4.32.1"
9
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0154c470a501e9cd0362780d5a5221550c7a9cd47d8cd39683a81edb84cfde9
3
+ size 2329638768
special_tokens_map.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<hl>"
4
+ ],
5
+ "eos_token": "</s>",
6
+ "pad_token": "<pad>",
7
+ "unk_token": "<unk>"
8
+ }
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef78f86560d809067d12bac6c09f19a462cb3af3f54d2b8acbba26e1433125d6
3
+ size 4309802
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c58c3dc929366af7c460d31895a225edc5077f5fb4438735a9896a78ab9842d7
3
+ size 16330813
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": null,
3
+ "clean_up_tokenization_spaces": true,
4
+ "eos_token": "</s>",
5
+ "extra_ids": 0,
6
+ "legacy": true,
7
+ "model_max_length": 1000000000000000019884624838656,
8
+ "pad_token": "<pad>",
9
+ "sp_model_kwargs": {},
10
+ "tokenizer_class": "T5Tokenizer",
11
+ "unk_token": "<unk>"
12
+ }
trainer_state.json ADDED
@@ -0,0 +1,768 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 19.78531558608845,
5
+ "eval_steps": 500,
6
+ "global_step": 720,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.99,
13
+ "learning_rate": 0.0001,
14
+ "loss": 8.9608,
15
+ "step": 36
16
+ },
17
+ {
18
+ "epoch": 0.99,
19
+ "eval_bleu": 0.2352,
20
+ "eval_bp": 0.828,
21
+ "eval_counts_1": 2306,
22
+ "eval_counts_2": 50,
23
+ "eval_counts_3": 12,
24
+ "eval_counts_4": 2,
25
+ "eval_exact_match": 0.0,
26
+ "eval_f1": 0.0092,
27
+ "eval_gen_len": 3.1969,
28
+ "eval_loss": 2.8882896900177,
29
+ "eval_precisions_1": 12.9,
30
+ "eval_precisions_2": 0.319,
31
+ "eval_precisions_3": 0.0891,
32
+ "eval_precisions_4": 0.0178,
33
+ "eval_ref_len": 21250,
34
+ "eval_rouge1": 0.0081,
35
+ "eval_rouge2": 0.0022,
36
+ "eval_rougeL": 0.0078,
37
+ "eval_rougeLsum": 0.0078,
38
+ "eval_runtime": 386.3015,
39
+ "eval_samples_per_second": 5.705,
40
+ "eval_steps_per_second": 1.426,
41
+ "eval_sys_len": 17876,
42
+ "eval_totals_1": 17876,
43
+ "eval_totals_2": 15672,
44
+ "eval_totals_3": 13468,
45
+ "eval_totals_4": 11264,
46
+ "step": 36
47
+ },
48
+ {
49
+ "epoch": 1.98,
50
+ "learning_rate": 0.0001,
51
+ "loss": 3.2364,
52
+ "step": 72
53
+ },
54
+ {
55
+ "epoch": 1.98,
56
+ "eval_bleu": 6.7083,
57
+ "eval_bp": 0.9954,
58
+ "eval_counts_1": 6125,
59
+ "eval_counts_2": 1727,
60
+ "eval_counts_3": 687,
61
+ "eval_counts_4": 277,
62
+ "eval_exact_match": 0.0018,
63
+ "eval_f1": 0.2514,
64
+ "eval_gen_len": 11.8072,
65
+ "eval_loss": 1.9241770505905151,
66
+ "eval_precisions_1": 28.9571,
67
+ "eval_precisions_2": 9.1144,
68
+ "eval_precisions_3": 4.103,
69
+ "eval_precisions_4": 1.9051,
70
+ "eval_ref_len": 21250,
71
+ "eval_rouge1": 0.2457,
72
+ "eval_rouge2": 0.1026,
73
+ "eval_rougeL": 0.2345,
74
+ "eval_rougeLsum": 0.2346,
75
+ "eval_runtime": 440.0537,
76
+ "eval_samples_per_second": 5.008,
77
+ "eval_steps_per_second": 1.252,
78
+ "eval_sys_len": 21152,
79
+ "eval_totals_1": 21152,
80
+ "eval_totals_2": 18948,
81
+ "eval_totals_3": 16744,
82
+ "eval_totals_4": 14540,
83
+ "step": 72
84
+ },
85
+ {
86
+ "epoch": 3.0,
87
+ "learning_rate": 0.0001,
88
+ "loss": 2.4963,
89
+ "step": 109
90
+ },
91
+ {
92
+ "epoch": 3.0,
93
+ "eval_bleu": 9.1493,
94
+ "eval_bp": 0.752,
95
+ "eval_counts_1": 6903,
96
+ "eval_counts_2": 2271,
97
+ "eval_counts_3": 975,
98
+ "eval_counts_4": 409,
99
+ "eval_exact_match": 0.01,
100
+ "eval_f1": 0.2909,
101
+ "eval_gen_len": 12.176,
102
+ "eval_loss": 1.6558014154434204,
103
+ "eval_precisions_1": 41.7428,
104
+ "eval_precisions_2": 15.8446,
105
+ "eval_precisions_3": 8.0386,
106
+ "eval_precisions_4": 4.1209,
107
+ "eval_ref_len": 21250,
108
+ "eval_rouge1": 0.2966,
109
+ "eval_rouge2": 0.1415,
110
+ "eval_rougeL": 0.2854,
111
+ "eval_rougeLsum": 0.2852,
112
+ "eval_runtime": 434.1741,
113
+ "eval_samples_per_second": 5.076,
114
+ "eval_steps_per_second": 1.269,
115
+ "eval_sys_len": 16537,
116
+ "eval_totals_1": 16537,
117
+ "eval_totals_2": 14333,
118
+ "eval_totals_3": 12129,
119
+ "eval_totals_4": 9925,
120
+ "step": 109
121
+ },
122
+ {
123
+ "epoch": 3.98,
124
+ "learning_rate": 0.0001,
125
+ "loss": 2.2314,
126
+ "step": 145
127
+ },
128
+ {
129
+ "epoch": 3.98,
130
+ "eval_bleu": 10.187,
131
+ "eval_bp": 0.7573,
132
+ "eval_counts_1": 7160,
133
+ "eval_counts_2": 2440,
134
+ "eval_counts_3": 1098,
135
+ "eval_counts_4": 501,
136
+ "eval_exact_match": 0.0136,
137
+ "eval_f1": 0.3069,
138
+ "eval_gen_len": 12.157,
139
+ "eval_loss": 1.5771422386169434,
140
+ "eval_precisions_1": 43.0625,
141
+ "eval_precisions_2": 16.9174,
142
+ "eval_precisions_3": 8.986,
143
+ "eval_precisions_4": 5.0025,
144
+ "eval_ref_len": 21250,
145
+ "eval_rouge1": 0.314,
146
+ "eval_rouge2": 0.1535,
147
+ "eval_rougeL": 0.3028,
148
+ "eval_rougeLsum": 0.3028,
149
+ "eval_runtime": 436.5308,
150
+ "eval_samples_per_second": 5.049,
151
+ "eval_steps_per_second": 1.262,
152
+ "eval_sys_len": 16627,
153
+ "eval_totals_1": 16627,
154
+ "eval_totals_2": 14423,
155
+ "eval_totals_3": 12219,
156
+ "eval_totals_4": 10015,
157
+ "step": 145
158
+ },
159
+ {
160
+ "epoch": 4.97,
161
+ "learning_rate": 0.0001,
162
+ "loss": 2.0578,
163
+ "step": 181
164
+ },
165
+ {
166
+ "epoch": 4.97,
167
+ "eval_bleu": 11.0621,
168
+ "eval_bp": 0.7961,
169
+ "eval_counts_1": 7447,
170
+ "eval_counts_2": 2625,
171
+ "eval_counts_3": 1214,
172
+ "eval_counts_4": 566,
173
+ "eval_exact_match": 0.0163,
174
+ "eval_f1": 0.32,
175
+ "eval_gen_len": 12.5585,
176
+ "eval_loss": 1.5346813201904297,
177
+ "eval_precisions_1": 43.0338,
178
+ "eval_precisions_2": 17.383,
179
+ "eval_precisions_3": 9.413,
180
+ "eval_precisions_4": 5.2932,
181
+ "eval_ref_len": 21250,
182
+ "eval_rouge1": 0.3286,
183
+ "eval_rouge2": 0.1628,
184
+ "eval_rougeL": 0.3146,
185
+ "eval_rougeLsum": 0.3146,
186
+ "eval_runtime": 444.2911,
187
+ "eval_samples_per_second": 4.961,
188
+ "eval_steps_per_second": 1.24,
189
+ "eval_sys_len": 17305,
190
+ "eval_totals_1": 17305,
191
+ "eval_totals_2": 15101,
192
+ "eval_totals_3": 12897,
193
+ "eval_totals_4": 10693,
194
+ "step": 181
195
+ },
196
+ {
197
+ "epoch": 5.99,
198
+ "learning_rate": 0.0001,
199
+ "loss": 1.8928,
200
+ "step": 218
201
+ },
202
+ {
203
+ "epoch": 5.99,
204
+ "eval_bleu": 11.4063,
205
+ "eval_bp": 0.7556,
206
+ "eval_counts_1": 7396,
207
+ "eval_counts_2": 2659,
208
+ "eval_counts_3": 1257,
209
+ "eval_counts_4": 611,
210
+ "eval_exact_match": 0.0177,
211
+ "eval_f1": 0.3234,
212
+ "eval_gen_len": 12.1692,
213
+ "eval_loss": 1.512817144393921,
214
+ "eval_precisions_1": 44.5596,
215
+ "eval_precisions_2": 18.473,
216
+ "eval_precisions_3": 10.3117,
217
+ "eval_precisions_4": 6.1186,
218
+ "eval_ref_len": 21250,
219
+ "eval_rouge1": 0.3326,
220
+ "eval_rouge2": 0.1684,
221
+ "eval_rougeL": 0.3198,
222
+ "eval_rougeLsum": 0.3198,
223
+ "eval_runtime": 441.07,
224
+ "eval_samples_per_second": 4.997,
225
+ "eval_steps_per_second": 1.249,
226
+ "eval_sys_len": 16598,
227
+ "eval_totals_1": 16598,
228
+ "eval_totals_2": 14394,
229
+ "eval_totals_3": 12190,
230
+ "eval_totals_4": 9986,
231
+ "step": 218
232
+ },
233
+ {
234
+ "epoch": 6.98,
235
+ "learning_rate": 0.0001,
236
+ "loss": 1.8573,
237
+ "step": 254
238
+ },
239
+ {
240
+ "epoch": 6.98,
241
+ "eval_bleu": 11.8292,
242
+ "eval_bp": 0.7631,
243
+ "eval_counts_1": 7531,
244
+ "eval_counts_2": 2758,
245
+ "eval_counts_3": 1313,
246
+ "eval_counts_4": 641,
247
+ "eval_exact_match": 0.0163,
248
+ "eval_f1": 0.327,
249
+ "eval_gen_len": 12.3035,
250
+ "eval_loss": 1.4735780954360962,
251
+ "eval_precisions_1": 45.0203,
252
+ "eval_precisions_2": 18.9893,
253
+ "eval_precisions_3": 10.6575,
254
+ "eval_precisions_4": 6.3365,
255
+ "eval_ref_len": 21250,
256
+ "eval_rouge1": 0.3349,
257
+ "eval_rouge2": 0.1717,
258
+ "eval_rougeL": 0.3216,
259
+ "eval_rougeLsum": 0.3216,
260
+ "eval_runtime": 442.6304,
261
+ "eval_samples_per_second": 4.979,
262
+ "eval_steps_per_second": 1.245,
263
+ "eval_sys_len": 16728,
264
+ "eval_totals_1": 16728,
265
+ "eval_totals_2": 14524,
266
+ "eval_totals_3": 12320,
267
+ "eval_totals_4": 10116,
268
+ "step": 254
269
+ },
270
+ {
271
+ "epoch": 8.0,
272
+ "learning_rate": 0.0001,
273
+ "loss": 1.7361,
274
+ "step": 291
275
+ },
276
+ {
277
+ "epoch": 8.0,
278
+ "eval_bleu": 12.2208,
279
+ "eval_bp": 0.7747,
280
+ "eval_counts_1": 7658,
281
+ "eval_counts_2": 2849,
282
+ "eval_counts_3": 1368,
283
+ "eval_counts_4": 668,
284
+ "eval_exact_match": 0.0181,
285
+ "eval_f1": 0.3334,
286
+ "eval_gen_len": 12.4628,
287
+ "eval_loss": 1.4544174671173096,
288
+ "eval_precisions_1": 45.2387,
289
+ "eval_precisions_2": 19.3494,
290
+ "eval_precisions_3": 10.9265,
291
+ "eval_precisions_4": 6.4754,
292
+ "eval_ref_len": 21250,
293
+ "eval_rouge1": 0.3414,
294
+ "eval_rouge2": 0.1762,
295
+ "eval_rougeL": 0.3283,
296
+ "eval_rougeLsum": 0.3284,
297
+ "eval_runtime": 442.3648,
298
+ "eval_samples_per_second": 4.982,
299
+ "eval_steps_per_second": 1.246,
300
+ "eval_sys_len": 16928,
301
+ "eval_totals_1": 16928,
302
+ "eval_totals_2": 14724,
303
+ "eval_totals_3": 12520,
304
+ "eval_totals_4": 10316,
305
+ "step": 291
306
+ },
307
+ {
308
+ "epoch": 8.99,
309
+ "learning_rate": 0.0001,
310
+ "loss": 1.7162,
311
+ "step": 327
312
+ },
313
+ {
314
+ "epoch": 8.99,
315
+ "eval_bleu": 12.4536,
316
+ "eval_bp": 0.767,
317
+ "eval_counts_1": 7703,
318
+ "eval_counts_2": 2891,
319
+ "eval_counts_3": 1390,
320
+ "eval_counts_4": 694,
321
+ "eval_exact_match": 0.0159,
322
+ "eval_f1": 0.3374,
323
+ "eval_gen_len": 12.4174,
324
+ "eval_loss": 1.4459445476531982,
325
+ "eval_precisions_1": 45.8648,
326
+ "eval_precisions_2": 19.8136,
327
+ "eval_precisions_3": 11.2214,
328
+ "eval_precisions_4": 6.8153,
329
+ "eval_ref_len": 21250,
330
+ "eval_rouge1": 0.3454,
331
+ "eval_rouge2": 0.1785,
332
+ "eval_rougeL": 0.3325,
333
+ "eval_rougeLsum": 0.3323,
334
+ "eval_runtime": 436.4836,
335
+ "eval_samples_per_second": 5.049,
336
+ "eval_steps_per_second": 1.262,
337
+ "eval_sys_len": 16795,
338
+ "eval_totals_1": 16795,
339
+ "eval_totals_2": 14591,
340
+ "eval_totals_3": 12387,
341
+ "eval_totals_4": 10183,
342
+ "step": 327
343
+ },
344
+ {
345
+ "epoch": 9.98,
346
+ "learning_rate": 0.0001,
347
+ "loss": 1.6589,
348
+ "step": 363
349
+ },
350
+ {
351
+ "epoch": 9.98,
352
+ "eval_bleu": 12.8553,
353
+ "eval_bp": 0.8002,
354
+ "eval_counts_1": 7889,
355
+ "eval_counts_2": 2983,
356
+ "eval_counts_3": 1449,
357
+ "eval_counts_4": 719,
358
+ "eval_exact_match": 0.0172,
359
+ "eval_f1": 0.3435,
360
+ "eval_gen_len": 12.7101,
361
+ "eval_loss": 1.438312292098999,
362
+ "eval_precisions_1": 45.4017,
363
+ "eval_precisions_2": 19.6612,
364
+ "eval_precisions_3": 11.1737,
365
+ "eval_precisions_4": 6.6797,
366
+ "eval_ref_len": 21250,
367
+ "eval_rouge1": 0.3519,
368
+ "eval_rouge2": 0.1816,
369
+ "eval_rougeL": 0.3375,
370
+ "eval_rougeLsum": 0.3372,
371
+ "eval_runtime": 449.3427,
372
+ "eval_samples_per_second": 4.905,
373
+ "eval_steps_per_second": 1.226,
374
+ "eval_sys_len": 17376,
375
+ "eval_totals_1": 17376,
376
+ "eval_totals_2": 15172,
377
+ "eval_totals_3": 12968,
378
+ "eval_totals_4": 10764,
379
+ "step": 363
380
+ },
381
+ {
382
+ "epoch": 10.99,
383
+ "learning_rate": 0.0001,
384
+ "loss": 1.5571,
385
+ "step": 400
386
+ },
387
+ {
388
+ "epoch": 10.99,
389
+ "eval_bleu": 12.9671,
390
+ "eval_bp": 0.7894,
391
+ "eval_counts_1": 7889,
392
+ "eval_counts_2": 2994,
393
+ "eval_counts_3": 1457,
394
+ "eval_counts_4": 736,
395
+ "eval_exact_match": 0.02,
396
+ "eval_f1": 0.3457,
397
+ "eval_gen_len": 12.6466,
398
+ "eval_loss": 1.4213731288909912,
399
+ "eval_precisions_1": 45.9063,
400
+ "eval_precisions_2": 19.9853,
401
+ "eval_precisions_3": 11.4033,
402
+ "eval_precisions_4": 6.9611,
403
+ "eval_ref_len": 21250,
404
+ "eval_rouge1": 0.3529,
405
+ "eval_rouge2": 0.1845,
406
+ "eval_rougeL": 0.3392,
407
+ "eval_rougeLsum": 0.3393,
408
+ "eval_runtime": 440.5687,
409
+ "eval_samples_per_second": 5.003,
410
+ "eval_steps_per_second": 1.251,
411
+ "eval_sys_len": 17185,
412
+ "eval_totals_1": 17185,
413
+ "eval_totals_2": 14981,
414
+ "eval_totals_3": 12777,
415
+ "eval_totals_4": 10573,
416
+ "step": 400
417
+ },
418
+ {
419
+ "epoch": 11.98,
420
+ "learning_rate": 0.0001,
421
+ "loss": 1.5502,
422
+ "step": 436
423
+ },
424
+ {
425
+ "epoch": 11.98,
426
+ "eval_bleu": 13.0741,
427
+ "eval_bp": 0.7712,
428
+ "eval_counts_1": 7930,
429
+ "eval_counts_2": 3008,
430
+ "eval_counts_3": 1477,
431
+ "eval_counts_4": 741,
432
+ "eval_exact_match": 0.0213,
433
+ "eval_f1": 0.3541,
434
+ "eval_gen_len": 12.4483,
435
+ "eval_loss": 1.4135174751281738,
436
+ "eval_precisions_1": 47.0121,
437
+ "eval_precisions_2": 20.5128,
438
+ "eval_precisions_3": 11.8539,
439
+ "eval_precisions_4": 7.225,
440
+ "eval_ref_len": 21250,
441
+ "eval_rouge1": 0.3619,
442
+ "eval_rouge2": 0.189,
443
+ "eval_rougeL": 0.3492,
444
+ "eval_rougeLsum": 0.3491,
445
+ "eval_runtime": 443.1145,
446
+ "eval_samples_per_second": 4.974,
447
+ "eval_steps_per_second": 1.243,
448
+ "eval_sys_len": 16868,
449
+ "eval_totals_1": 16868,
450
+ "eval_totals_2": 14664,
451
+ "eval_totals_3": 12460,
452
+ "eval_totals_4": 10256,
453
+ "step": 436
454
+ },
455
+ {
456
+ "epoch": 13.0,
457
+ "learning_rate": 0.0001,
458
+ "loss": 1.4564,
459
+ "step": 473
460
+ },
461
+ {
462
+ "epoch": 13.0,
463
+ "eval_bleu": 14.1014,
464
+ "eval_bp": 0.8309,
465
+ "eval_counts_1": 8268,
466
+ "eval_counts_2": 3200,
467
+ "eval_counts_3": 1616,
468
+ "eval_counts_4": 837,
469
+ "eval_exact_match": 0.0218,
470
+ "eval_f1": 0.3647,
471
+ "eval_gen_len": 13.2441,
472
+ "eval_loss": 1.3942722082138062,
473
+ "eval_precisions_1": 46.1152,
474
+ "eval_precisions_2": 20.3498,
475
+ "eval_precisions_3": 11.9518,
476
+ "eval_precisions_4": 7.396,
477
+ "eval_ref_len": 21250,
478
+ "eval_rouge1": 0.3729,
479
+ "eval_rouge2": 0.1974,
480
+ "eval_rougeL": 0.3578,
481
+ "eval_rougeLsum": 0.3576,
482
+ "eval_runtime": 460.2282,
483
+ "eval_samples_per_second": 4.789,
484
+ "eval_steps_per_second": 1.197,
485
+ "eval_sys_len": 17929,
486
+ "eval_totals_1": 17929,
487
+ "eval_totals_2": 15725,
488
+ "eval_totals_3": 13521,
489
+ "eval_totals_4": 11317,
490
+ "step": 473
491
+ },
492
+ {
493
+ "epoch": 13.99,
494
+ "learning_rate": 0.0001,
495
+ "loss": 1.4522,
496
+ "step": 509
497
+ },
498
+ {
499
+ "epoch": 13.99,
500
+ "eval_bleu": 13.7526,
501
+ "eval_bp": 0.7667,
502
+ "eval_counts_1": 8047,
503
+ "eval_counts_2": 3130,
504
+ "eval_counts_3": 1564,
505
+ "eval_counts_4": 811,
506
+ "eval_exact_match": 0.0227,
507
+ "eval_f1": 0.3627,
508
+ "eval_gen_len": 12.515,
509
+ "eval_loss": 1.3952871561050415,
510
+ "eval_precisions_1": 47.9302,
511
+ "eval_precisions_2": 21.4604,
512
+ "eval_precisions_3": 12.6323,
513
+ "eval_precisions_4": 7.9689,
514
+ "eval_ref_len": 21250,
515
+ "eval_rouge1": 0.3712,
516
+ "eval_rouge2": 0.197,
517
+ "eval_rougeL": 0.3582,
518
+ "eval_rougeLsum": 0.3581,
519
+ "eval_runtime": 437.5396,
520
+ "eval_samples_per_second": 5.037,
521
+ "eval_steps_per_second": 1.259,
522
+ "eval_sys_len": 16789,
523
+ "eval_totals_1": 16789,
524
+ "eval_totals_2": 14585,
525
+ "eval_totals_3": 12381,
526
+ "eval_totals_4": 10177,
527
+ "step": 509
528
+ },
529
+ {
530
+ "epoch": 14.98,
531
+ "learning_rate": 0.0001,
532
+ "loss": 1.407,
533
+ "step": 545
534
+ },
535
+ {
536
+ "epoch": 14.98,
537
+ "eval_bleu": 14.7315,
538
+ "eval_bp": 0.8306,
539
+ "eval_counts_1": 8498,
540
+ "eval_counts_2": 3358,
541
+ "eval_counts_3": 1703,
542
+ "eval_counts_4": 877,
543
+ "eval_exact_match": 0.0213,
544
+ "eval_f1": 0.3772,
545
+ "eval_gen_len": 13.2849,
546
+ "eval_loss": 1.3759350776672363,
547
+ "eval_precisions_1": 47.4139,
548
+ "eval_precisions_2": 21.3627,
549
+ "eval_precisions_3": 12.6008,
550
+ "eval_precisions_4": 7.7535,
551
+ "eval_ref_len": 21250,
552
+ "eval_rouge1": 0.3856,
553
+ "eval_rouge2": 0.2063,
554
+ "eval_rougeL": 0.3709,
555
+ "eval_rougeLsum": 0.3706,
556
+ "eval_runtime": 453.6157,
557
+ "eval_samples_per_second": 4.859,
558
+ "eval_steps_per_second": 1.215,
559
+ "eval_sys_len": 17923,
560
+ "eval_totals_1": 17923,
561
+ "eval_totals_2": 15719,
562
+ "eval_totals_3": 13515,
563
+ "eval_totals_4": 11311,
564
+ "step": 545
565
+ },
566
+ {
567
+ "epoch": 15.99,
568
+ "learning_rate": 0.0001,
569
+ "loss": 1.3294,
570
+ "step": 582
571
+ },
572
+ {
573
+ "epoch": 15.99,
574
+ "eval_bleu": 14.868,
575
+ "eval_bp": 0.8044,
576
+ "eval_counts_1": 8481,
577
+ "eval_counts_2": 3407,
578
+ "eval_counts_3": 1721,
579
+ "eval_counts_4": 883,
580
+ "eval_exact_match": 0.024,
581
+ "eval_f1": 0.3822,
582
+ "eval_gen_len": 12.9142,
583
+ "eval_loss": 1.3775662183761597,
584
+ "eval_precisions_1": 48.5989,
585
+ "eval_precisions_2": 22.3454,
586
+ "eval_precisions_3": 13.1948,
587
+ "eval_precisions_4": 8.1465,
588
+ "eval_ref_len": 21250,
589
+ "eval_rouge1": 0.3907,
590
+ "eval_rouge2": 0.211,
591
+ "eval_rougeL": 0.3766,
592
+ "eval_rougeLsum": 0.3766,
593
+ "eval_runtime": 448.6685,
594
+ "eval_samples_per_second": 4.912,
595
+ "eval_steps_per_second": 1.228,
596
+ "eval_sys_len": 17451,
597
+ "eval_totals_1": 17451,
598
+ "eval_totals_2": 15247,
599
+ "eval_totals_3": 13043,
600
+ "eval_totals_4": 10839,
601
+ "step": 582
602
+ },
603
+ {
604
+ "epoch": 16.98,
605
+ "learning_rate": 0.0001,
606
+ "loss": 1.3294,
607
+ "step": 618
608
+ },
609
+ {
610
+ "epoch": 16.98,
611
+ "eval_bleu": 15.2312,
612
+ "eval_bp": 0.835,
613
+ "eval_counts_1": 8633,
614
+ "eval_counts_2": 3464,
615
+ "eval_counts_3": 1767,
616
+ "eval_counts_4": 923,
617
+ "eval_exact_match": 0.0263,
618
+ "eval_f1": 0.3868,
619
+ "eval_gen_len": 13.3103,
620
+ "eval_loss": 1.380259394645691,
621
+ "eval_precisions_1": 47.9505,
622
+ "eval_precisions_2": 21.9241,
623
+ "eval_precisions_3": 12.9965,
624
+ "eval_precisions_4": 8.1022,
625
+ "eval_ref_len": 21250,
626
+ "eval_rouge1": 0.3946,
627
+ "eval_rouge2": 0.2133,
628
+ "eval_rougeL": 0.3801,
629
+ "eval_rougeLsum": 0.3798,
630
+ "eval_runtime": 456.612,
631
+ "eval_samples_per_second": 4.827,
632
+ "eval_steps_per_second": 1.207,
633
+ "eval_sys_len": 18004,
634
+ "eval_totals_1": 18004,
635
+ "eval_totals_2": 15800,
636
+ "eval_totals_3": 13596,
637
+ "eval_totals_4": 11392,
638
+ "step": 618
639
+ },
640
+ {
641
+ "epoch": 18.0,
642
+ "learning_rate": 0.0001,
643
+ "loss": 1.2605,
644
+ "step": 655
645
+ },
646
+ {
647
+ "epoch": 18.0,
648
+ "eval_bleu": 14.779,
649
+ "eval_bp": 0.8255,
650
+ "eval_counts_1": 8560,
651
+ "eval_counts_2": 3376,
652
+ "eval_counts_3": 1695,
653
+ "eval_counts_4": 880,
654
+ "eval_exact_match": 0.0231,
655
+ "eval_f1": 0.3846,
656
+ "eval_gen_len": 13.1665,
657
+ "eval_loss": 1.3709588050842285,
658
+ "eval_precisions_1": 48.009,
659
+ "eval_precisions_2": 21.605,
660
+ "eval_precisions_3": 12.6285,
661
+ "eval_precisions_4": 7.8445,
662
+ "eval_ref_len": 21250,
663
+ "eval_rouge1": 0.3922,
664
+ "eval_rouge2": 0.2092,
665
+ "eval_rougeL": 0.3778,
666
+ "eval_rougeLsum": 0.3775,
667
+ "eval_runtime": 456.164,
668
+ "eval_samples_per_second": 4.832,
669
+ "eval_steps_per_second": 1.208,
670
+ "eval_sys_len": 17830,
671
+ "eval_totals_1": 17830,
672
+ "eval_totals_2": 15626,
673
+ "eval_totals_3": 13422,
674
+ "eval_totals_4": 11218,
675
+ "step": 655
676
+ },
677
+ {
678
+ "epoch": 18.99,
679
+ "learning_rate": 0.0001,
680
+ "loss": 1.2667,
681
+ "step": 691
682
+ },
683
+ {
684
+ "epoch": 18.99,
685
+ "eval_bleu": 15.0008,
686
+ "eval_bp": 0.8257,
687
+ "eval_counts_1": 8664,
688
+ "eval_counts_2": 3455,
689
+ "eval_counts_3": 1733,
690
+ "eval_counts_4": 882,
691
+ "eval_exact_match": 0.0227,
692
+ "eval_f1": 0.3906,
693
+ "eval_gen_len": 13.2232,
694
+ "eval_loss": 1.3694192171096802,
695
+ "eval_precisions_1": 48.5814,
696
+ "eval_precisions_2": 22.1049,
697
+ "eval_precisions_3": 12.9078,
698
+ "eval_precisions_4": 7.8596,
699
+ "eval_ref_len": 21250,
700
+ "eval_rouge1": 0.3987,
701
+ "eval_rouge2": 0.2138,
702
+ "eval_rougeL": 0.3853,
703
+ "eval_rougeLsum": 0.3851,
704
+ "eval_runtime": 454.2362,
705
+ "eval_samples_per_second": 4.852,
706
+ "eval_steps_per_second": 1.213,
707
+ "eval_sys_len": 17834,
708
+ "eval_totals_1": 17834,
709
+ "eval_totals_2": 15630,
710
+ "eval_totals_3": 13426,
711
+ "eval_totals_4": 11222,
712
+ "step": 691
713
+ },
714
+ {
715
+ "epoch": 19.79,
716
+ "learning_rate": 0.0001,
717
+ "loss": 1.2074,
718
+ "step": 720
719
+ },
720
+ {
721
+ "epoch": 19.79,
722
+ "eval_bleu": 15.0442,
723
+ "eval_bp": 0.8369,
724
+ "eval_counts_1": 8770,
725
+ "eval_counts_2": 3465,
726
+ "eval_counts_3": 1737,
727
+ "eval_counts_4": 880,
728
+ "eval_exact_match": 0.0227,
729
+ "eval_f1": 0.3941,
730
+ "eval_gen_len": 13.4424,
731
+ "eval_loss": 1.365785837173462,
732
+ "eval_precisions_1": 48.6169,
733
+ "eval_precisions_2": 21.8819,
734
+ "eval_precisions_3": 12.743,
735
+ "eval_precisions_4": 7.7011,
736
+ "eval_ref_len": 21250,
737
+ "eval_rouge1": 0.4025,
738
+ "eval_rouge2": 0.215,
739
+ "eval_rougeL": 0.3883,
740
+ "eval_rougeLsum": 0.3879,
741
+ "eval_runtime": 459.1457,
742
+ "eval_samples_per_second": 4.8,
743
+ "eval_steps_per_second": 1.2,
744
+ "eval_sys_len": 18039,
745
+ "eval_totals_1": 18039,
746
+ "eval_totals_2": 15835,
747
+ "eval_totals_3": 13631,
748
+ "eval_totals_4": 11427,
749
+ "step": 720
750
+ },
751
+ {
752
+ "epoch": 19.79,
753
+ "step": 720,
754
+ "total_flos": 4.419252384883016e+17,
755
+ "train_loss": 2.0875697082943385,
756
+ "train_runtime": 23544.6757,
757
+ "train_samples_per_second": 7.912,
758
+ "train_steps_per_second": 0.031
759
+ }
760
+ ],
761
+ "logging_steps": 500,
762
+ "max_steps": 720,
763
+ "num_train_epochs": 20,
764
+ "save_steps": 500,
765
+ "total_flos": 4.419252384883016e+17,
766
+ "trial_name": null,
767
+ "trial_params": null
768
+ }
training_args.bin ADDED
Binary file (4.66 kB). View file