Marvin committed on
Commit
078ae09
0 Parent(s):

Initial commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ *.model filter=lfs diff=lfs merge=lfs -text
3
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
4
+ runs/** filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - de
4
+ tags:
5
+ - question-generation
6
+ - german
7
+ - text2text-generation
8
+ - generated_from_trainer
9
+ datasets:
10
+ - lmqg/qg_dequad
11
+ metrics:
12
+ - bleu4
13
+ - f1
14
+ - rouge
15
+ - exact_match
16
+ model-index:
17
+ - name: german-jeopardy-mt5-base-128
18
+ results:
19
+ - task:
20
+ name: Sequence-to-sequence Language Modeling
21
+ type: text2text-generation
22
+ dataset:
23
+ name: lmqg/qg_dequad
24
+ type: default
25
+ args: default
26
+ metrics:
27
+ - name: BLEU-4
28
+ type: bleu4
29
+ value: 14.62
30
+ - name: F1
31
+ type: f1
32
+ value: 39.47
33
+ - name: ROUGE-1
34
+ type: rouge1
35
+ value: 40.45
36
+ - name: ROUGE-2
37
+ type: rouge2
38
+ value: 21.49
39
+ - name: ROUGE-L
40
+ type: rougel
41
+ value: 39.02
42
+ - name: ROUGE-Lsum
43
+ type: rougelsum
44
+ value: 39.01
45
+ - name: Exact Match
46
+ type: exact_match
47
+ value: 2.68
48
+ ---
49
+
50
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
51
+ should probably proofread and complete it, then remove this comment. -->
52
+
53
+ # german-jeopardy-mt5-base-128
54
+
55
+ This model is a fine-tuned version of [google/mt5-base](https://huggingface.co/google/mt5-base) on the [lmqg/qg_dequad](https://huggingface.co/datasets/lmqg/qg_dequad) dataset.
56
+ It achieves the following results on the test set:
57
+ - Loss: 1.56
58
+ - Brevity Penalty: 0.8709
59
+ - System Length: 18267
60
+ - Reference Length: 20793
61
+ - ROUGE-1: 40.45
62
+ - ROUGE-2: 21.49
63
+ - ROUGE-L: 39.02
64
+ - ROUGE-Lsum: 39.01
65
+ - Exact Match: 2.68
66
+ - BLEU: 14.62
67
+ - F1: 39.47
68
+
69
+ ## Model description
70
+
71
+ See [google/mt5-base](https://huggingface.co/google/mt5-base) for the model architecture.
72
+ The model was trained on a single NVIDIA RTX 3090 GPU with 24GB of VRAM.
73
+
74
+ ## Intended uses & limitations
75
+
76
+ This model can be used for question generation on German text.
77
+
78
+ ## Training and evaluation data
79
+
80
+ See [lmqg/qg_dequad](https://huggingface.co/datasets/lmqg/qg_dequad).
81
+
82
+ ## Training procedure
83
+
84
+ ### Training hyperparameters
85
+
86
+ The following hyperparameters were used during training:
87
+ - learning_rate: 0.0001
88
+ - train_batch_size: 4
89
+ - eval_batch_size: 4
90
+ - seed: 7
91
+ - gradient_accumulation_steps: 32
92
+ - total_train_batch_size: 128
93
+ - optimizer: Adafactor
94
+ - lr_scheduler_type: constant
95
+ - num_epochs: 20
96
+
97
+ ### Training results
98
+
99
+ | Training Loss | Epoch | Step | Validation Loss | Counts 1 | Counts 2 | Counts 3 | Counts 4 | Totals 1 | Totals 2 | Totals 3 | Totals 4 | Precisions 1 | Precisions 2 | Precisions 3 | Precisions 4 | Brevity Penalty | System Length | Reference Length | ROUGE-1 | ROUGE-2 | ROUGE-L | ROUGE-Lsum | Exact Match | BLEU | Mean Generated Length | F1 |
100
+ |:-------------:|:-----:|:----:|:---------------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:------------:|:------------:|:------------:|:------------:|:---------------:|:-------------:|:----------------:|:-------:|:-------:|:-------:|:----------:|:-----------:|:-------:|:---------------------:|:------:|
101
+ | 6.6905 | 0.99 | 72 | 2.0972 | 5515 | 1394 | 522 | 191 | 28172 | 25968 | 23764 | 21560 | 19.5762 | 5.3681 | 2.1966 | 0.8859 | 1.0 | 28172 | 21250 | 0.1942 | 0.0761 | 0.1837 | 0.1841 | 0.0 | 3.7816 | 11.2786 | 0.2106 |
102
+ | 2.4978 | 1.99 | 145 | 1.6211 | 7079 | 2339 | 1027 | 446 | 16544 | 14340 | 12136 | 9932 | 42.7889 | 16.311 | 8.4624 | 4.4905 | 0.7524 | 16544 | 21250 | 0.3097 | 0.1455 | 0.2971 | 0.2969 | 0.01 | 9.6021 | 12.0159 | 0.3032 |
103
+ | 2.1021 | 3.0 | 218 | 1.5342 | 7507 | 2637 | 1222 | 575 | 17211 | 15007 | 12803 | 10599 | 43.6175 | 17.5718 | 9.5446 | 5.425 | 0.7908 | 17211 | 21250 | 0.3304 | 0.1642 | 0.3172 | 0.3171 | 0.0141 | 11.162 | 12.6375 | 0.3228 |
104
+ | 1.9208 | 4.0 | 291 | 1.4862 | 7599 | 2755 | 1296 | 620 | 16871 | 14667 | 12463 | 10259 | 45.0418 | 18.7837 | 10.3988 | 6.0435 | 0.7714 | 16871 | 21250 | 0.3377 | 0.1721 | 0.3232 | 0.3229 | 0.015 | 11.7136 | 12.3938 | 0.33 |
105
+ | 1.8135 | 4.99 | 363 | 1.4626 | 7831 | 2955 | 1424 | 694 | 17184 | 14980 | 12776 | 10572 | 45.5715 | 19.7263 | 11.1459 | 6.5645 | 0.7893 | 17184 | 21250 | 0.3497 | 0.1837 | 0.3358 | 0.3354 | 0.0177 | 12.6402 | 12.6366 | 0.3417 |
106
+ | 1.6907 | 5.99 | 436 | 1.4392 | 7872 | 3023 | 1482 | 740 | 16907 | 14703 | 12499 | 10295 | 46.5606 | 20.5604 | 11.8569 | 7.188 | 0.7735 | 16907 | 21250 | 0.3566 | 0.1896 | 0.3432 | 0.343 | 0.0177 | 13.0722 | 12.564 | 0.3483 |
107
+ | 1.6159 | 6.99 | 509 | 1.4288 | 7981 | 3128 | 1542 | 773 | 17016 | 14812 | 12608 | 10404 | 46.9029 | 21.118 | 12.2303 | 7.4298 | 0.7797 | 17016 | 21250 | 0.363 | 0.1952 | 0.3504 | 0.3502 | 0.0191 | 13.5053 | 12.5749 | 0.3543 |
108
+ | 1.556 | 8.0 | 582 | 1.4132 | 8014 | 3046 | 1496 | 748 | 17320 | 15116 | 12912 | 10708 | 46.2702 | 20.1508 | 11.5861 | 6.9854 | 0.797 | 17320 | 21250 | 0.3632 | 0.1903 | 0.3489 | 0.3491 | 0.0222 | 13.2095 | 12.7641 | 0.355 |
109
+ | 1.4951 | 9.0 | 655 | 1.3926 | 8342 | 3271 | 1622 | 819 | 17178 | 14974 | 12770 | 10566 | 48.5621 | 21.8445 | 12.7016 | 7.7513 | 0.789 | 17178 | 21250 | 0.3843 | 0.2059 | 0.3704 | 0.3704 | 0.0218 | 14.1831 | 12.7654 | 0.3769 |
110
+ | 1.4522 | 9.99 | 727 | 1.3769 | 8639 | 3449 | 1740 | 891 | 17708 | 15504 | 13300 | 11096 | 48.7859 | 22.2459 | 13.0827 | 8.0299 | 0.8187 | 17708 | 21250 | 0.3972 | 0.2129 | 0.3821 | 0.3823 | 0.024 | 15.0442 | 13.1016 | 0.3895 |
111
+ | 1.3663 | 10.99 | 800 | 1.3677 | 8736 | 3468 | 1747 | 924 | 17674 | 15470 | 13266 | 11062 | 49.4285 | 22.4176 | 13.169 | 8.3529 | 0.8168 | 17674 | 21250 | 0.4027 | 0.215 | 0.3871 | 0.387 | 0.0245 | 15.2622 | 13.0399 | 0.3946 |
112
+ | 1.3122 | 11.99 | 873 | 1.3521 | 8833 | 3533 | 1780 | 915 | 17927 | 15723 | 13519 | 11315 | 49.272 | 22.4703 | 13.1667 | 8.0866 | 0.8308 | 17927 | 21250 | 0.4055 | 0.219 | 0.3915 | 0.3915 | 0.0222 | 15.3943 | 13.3494 | 0.3975 |
113
+ | 1.2641 | 13.0 | 946 | 1.3494 | 9048 | 3668 | 1864 | 989 | 18242 | 16038 | 13834 | 11630 | 49.5998 | 22.8707 | 13.474 | 8.5039 | 0.848 | 18242 | 21250 | 0.4165 | 0.2265 | 0.4011 | 0.401 | 0.0268 | 16.1011 | 13.5508 | 0.408 |
114
+ | 1.2359 | 13.99 | 1018 | 1.3488 | 9075 | 3709 | 1907 | 1013 | 18098 | 15894 | 13690 | 11486 | 50.1437 | 23.3359 | 13.9299 | 8.8194 | 0.8402 | 18098 | 21250 | 0.4195 | 0.2298 | 0.4041 | 0.4038 | 0.0259 | 16.3595 | 13.5681 | 0.4113 |
115
+ | 1.1754 | 14.99 | 1091 | 1.3482 | 9182 | 3777 | 1957 | 1048 | 18366 | 16162 | 13958 | 11754 | 49.9946 | 23.3696 | 14.0206 | 8.9161 | 0.8547 | 18366 | 21250 | 0.4227 | 0.2314 | 0.406 | 0.4058 | 0.0268 | 16.7083 | 13.6534 | 0.4145 |
116
+ | 1.1367 | 15.99 | 1164 | 1.3501 | 9164 | 3761 | 1935 | 1033 | 18310 | 16106 | 13902 | 11698 | 50.0492 | 23.3515 | 13.9189 | 8.8306 | 0.8517 | 18310 | 21250 | 0.4225 | 0.2316 | 0.4078 | 0.4079 | 0.0245 | 16.5803 | 13.6152 | 0.4147 |
117
+ | 1.096 | 17.0 | 1237 | 1.3586 | 9126 | 3712 | 1922 | 1050 | 18277 | 16073 | 13869 | 11665 | 49.9316 | 23.0946 | 13.8582 | 9.0013 | 0.8499 | 18277 | 21250 | 0.4217 | 0.2304 | 0.4066 | 0.4066 | 0.0295 | 16.5513 | 13.6325 | 0.4141 |
118
+ | 1.0571 | 18.0 | 1310 | 1.3658 | 9087 | 3707 | 1923 | 1033 | 18179 | 15975 | 13771 | 11567 | 49.9862 | 23.205 | 13.9641 | 8.9306 | 0.8446 | 18179 | 21250 | 0.4196 | 0.2301 | 0.4049 | 0.4049 | 0.029 | 16.4708 | 13.5172 | 0.4116 |
119
+ | 1.036 | 18.99 | 1382 | 1.3672 | 9206 | 3806 | 1976 | 1059 | 18332 | 16128 | 13924 | 11720 | 50.2182 | 23.5987 | 14.1913 | 9.0358 | 0.8528 | 18332 | 21250 | 0.4254 | 0.2348 | 0.4106 | 0.4107 | 0.0309 | 16.8386 | 13.7205 | 0.4174 |
120
+ | 0.9785 | 19.79 | 1440 | 1.3819 | 9180 | 3796 | 1973 | 1059 | 18164 | 15960 | 13756 | 11552 | 50.5395 | 23.7845 | 14.3428 | 9.1672 | 0.8438 | 18164 | 21250 | 0.4254 | 0.2344 | 0.4116 | 0.4117 | 0.0327 | 16.8234 | 13.5113 | 0.4172 |
121
+
122
+
123
+ ### Framework versions
124
+
125
+ - Transformers 4.32.1
126
+ - PyTorch 2.1.0
127
+ - Datasets 2.12.0
128
+ - Tokenizers 0.13.3
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<hl>": 250100
3
+ }
all_results.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 19.79,
3
+ "eval_bleu": 16.8234,
4
+ "eval_bp": 0.8438,
5
+ "eval_counts_1": 9180,
6
+ "eval_counts_2": 3796,
7
+ "eval_counts_3": 1973,
8
+ "eval_counts_4": 1059,
9
+ "eval_exact_match": 0.0327,
10
+ "eval_f1": 0.4172,
11
+ "eval_gen_len": 13.5113,
12
+ "eval_loss": 1.381914496421814,
13
+ "eval_precisions_1": 50.5395,
14
+ "eval_precisions_2": 23.7845,
15
+ "eval_precisions_3": 14.3428,
16
+ "eval_precisions_4": 9.1672,
17
+ "eval_ref_len": 21250,
18
+ "eval_rouge1": 0.4254,
19
+ "eval_rouge2": 0.2344,
20
+ "eval_rougeL": 0.4116,
21
+ "eval_rougeLsum": 0.4117,
22
+ "eval_runtime": 466.4651,
23
+ "eval_samples": 2204,
24
+ "eval_samples_per_second": 4.725,
25
+ "eval_steps_per_second": 1.181,
26
+ "eval_sys_len": 18164,
27
+ "eval_totals_1": 18164,
28
+ "eval_totals_2": 15960,
29
+ "eval_totals_3": 13756,
30
+ "eval_totals_4": 11552,
31
+ "predict_bleu": 14.2634,
32
+ "predict_bp": 0.8719,
33
+ "predict_counts_1": 8549,
34
+ "predict_counts_2": 3243,
35
+ "predict_counts_3": 1568,
36
+ "predict_counts_4": 785,
37
+ "predict_exact_match": 0.0254,
38
+ "predict_f1": 0.385,
39
+ "predict_gen_len": 13.7709,
40
+ "predict_loss": 1.6239880323410034,
41
+ "predict_precisions_1": 46.7491,
42
+ "predict_precisions_2": 20.1641,
43
+ "predict_precisions_3": 11.2976,
44
+ "predict_precisions_4": 6.7238,
45
+ "predict_ref_len": 20793,
46
+ "predict_rouge1": 0.3945,
47
+ "predict_rouge2": 0.2094,
48
+ "predict_rougeL": 0.381,
49
+ "predict_rougeLsum": 0.381,
50
+ "predict_runtime": 471.1259,
51
+ "predict_samples": 2204,
52
+ "predict_samples_per_second": 4.678,
53
+ "predict_steps_per_second": 1.17,
54
+ "predict_sys_len": 18287,
55
+ "predict_totals_1": 18287,
56
+ "predict_totals_2": 16083,
57
+ "predict_totals_3": 13879,
58
+ "predict_totals_4": 11675,
59
+ "train_loss": 1.7299000342686972,
60
+ "train_runtime": 27815.7883,
61
+ "train_samples": 9314,
62
+ "train_samples_per_second": 6.697,
63
+ "train_steps_per_second": 0.052
64
+ }
config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/mt5-base",
3
+ "architectures": [
4
+ "MT5ForConditionalGeneration"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 2048,
8
+ "d_kv": 64,
9
+ "d_model": 768,
10
+ "decoder_start_token_id": 0,
11
+ "dense_act_fn": "gelu_new",
12
+ "dropout_rate": 0.1,
13
+ "eos_token_id": 1,
14
+ "feed_forward_proj": "gated-gelu",
15
+ "initializer_factor": 1.0,
16
+ "is_encoder_decoder": true,
17
+ "is_gated_act": true,
18
+ "layer_norm_epsilon": 1e-06,
19
+ "length_penalty": 0.0,
20
+ "max_length": 64,
21
+ "model_type": "mt5",
22
+ "num_beams": 4,
23
+ "num_decoder_layers": 12,
24
+ "num_heads": 12,
25
+ "num_layers": 12,
26
+ "output_past": true,
27
+ "pad_token_id": 0,
28
+ "relative_attention_max_distance": 128,
29
+ "relative_attention_num_buckets": 32,
30
+ "tie_word_embeddings": false,
31
+ "tokenizer_class": "T5Tokenizer",
32
+ "torch_dtype": "float32",
33
+ "transformers_version": "4.32.1",
34
+ "use_cache": true,
35
+ "vocab_size": 250112
36
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "decoder_start_token_id": 0,
3
+ "eos_token_id": 1,
4
+ "length_penalty": 0.0,
5
+ "max_length": 64,
6
+ "num_beams": 4,
7
+ "pad_token_id": 0,
8
+ "transformers_version": "4.32.1"
9
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4669bdb3bbc4ba7d4c86e12b0521186d42c641865f011190d851ce6fd5f0fe98
3
+ size 2329638768
special_tokens_map.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<hl>"
4
+ ],
5
+ "eos_token": "</s>",
6
+ "pad_token": "<pad>",
7
+ "unk_token": "<unk>"
8
+ }
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef78f86560d809067d12bac6c09f19a462cb3af3f54d2b8acbba26e1433125d6
3
+ size 4309802
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c58c3dc929366af7c460d31895a225edc5077f5fb4438735a9896a78ab9842d7
3
+ size 16330813
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": null,
3
+ "clean_up_tokenization_spaces": true,
4
+ "eos_token": "</s>",
5
+ "extra_ids": 0,
6
+ "legacy": true,
7
+ "model_max_length": 1000000000000000019884624838656,
8
+ "pad_token": "<pad>",
9
+ "sp_model_kwargs": {},
10
+ "tokenizer_class": "T5Tokenizer",
11
+ "unk_token": "<unk>"
12
+ }
trainer_state.json ADDED
@@ -0,0 +1,768 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 19.78531558608845,
5
+ "eval_steps": 500,
6
+ "global_step": 1440,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.99,
13
+ "learning_rate": 0.0001,
14
+ "loss": 6.6905,
15
+ "step": 72
16
+ },
17
+ {
18
+ "epoch": 0.99,
19
+ "eval_bleu": 3.7816,
20
+ "eval_bp": 1.0,
21
+ "eval_counts_1": 5515,
22
+ "eval_counts_2": 1394,
23
+ "eval_counts_3": 522,
24
+ "eval_counts_4": 191,
25
+ "eval_exact_match": 0.0,
26
+ "eval_f1": 0.2106,
27
+ "eval_gen_len": 11.2786,
28
+ "eval_loss": 2.097219705581665,
29
+ "eval_precisions_1": 19.5762,
30
+ "eval_precisions_2": 5.3681,
31
+ "eval_precisions_3": 2.1966,
32
+ "eval_precisions_4": 0.8859,
33
+ "eval_ref_len": 21250,
34
+ "eval_rouge1": 0.1942,
35
+ "eval_rouge2": 0.0761,
36
+ "eval_rougeL": 0.1837,
37
+ "eval_rougeLsum": 0.1841,
38
+ "eval_runtime": 456.7865,
39
+ "eval_samples_per_second": 4.825,
40
+ "eval_steps_per_second": 1.206,
41
+ "eval_sys_len": 28172,
42
+ "eval_totals_1": 28172,
43
+ "eval_totals_2": 25968,
44
+ "eval_totals_3": 23764,
45
+ "eval_totals_4": 21560,
46
+ "step": 72
47
+ },
48
+ {
49
+ "epoch": 1.99,
50
+ "learning_rate": 0.0001,
51
+ "loss": 2.4978,
52
+ "step": 145
53
+ },
54
+ {
55
+ "epoch": 1.99,
56
+ "eval_bleu": 9.6021,
57
+ "eval_bp": 0.7524,
58
+ "eval_counts_1": 7079,
59
+ "eval_counts_2": 2339,
60
+ "eval_counts_3": 1027,
61
+ "eval_counts_4": 446,
62
+ "eval_exact_match": 0.01,
63
+ "eval_f1": 0.3032,
64
+ "eval_gen_len": 12.0159,
65
+ "eval_loss": 1.6211049556732178,
66
+ "eval_precisions_1": 42.7889,
67
+ "eval_precisions_2": 16.311,
68
+ "eval_precisions_3": 8.4624,
69
+ "eval_precisions_4": 4.4905,
70
+ "eval_ref_len": 21250,
71
+ "eval_rouge1": 0.3097,
72
+ "eval_rouge2": 0.1455,
73
+ "eval_rougeL": 0.2971,
74
+ "eval_rougeLsum": 0.2969,
75
+ "eval_runtime": 435.2772,
76
+ "eval_samples_per_second": 5.063,
77
+ "eval_steps_per_second": 1.266,
78
+ "eval_sys_len": 16544,
79
+ "eval_totals_1": 16544,
80
+ "eval_totals_2": 14340,
81
+ "eval_totals_3": 12136,
82
+ "eval_totals_4": 9932,
83
+ "step": 145
84
+ },
85
+ {
86
+ "epoch": 3.0,
87
+ "learning_rate": 0.0001,
88
+ "loss": 2.1021,
89
+ "step": 218
90
+ },
91
+ {
92
+ "epoch": 3.0,
93
+ "eval_bleu": 11.162,
94
+ "eval_bp": 0.7908,
95
+ "eval_counts_1": 7507,
96
+ "eval_counts_2": 2637,
97
+ "eval_counts_3": 1222,
98
+ "eval_counts_4": 575,
99
+ "eval_exact_match": 0.0141,
100
+ "eval_f1": 0.3228,
101
+ "eval_gen_len": 12.6375,
102
+ "eval_loss": 1.5342339277267456,
103
+ "eval_precisions_1": 43.6175,
104
+ "eval_precisions_2": 17.5718,
105
+ "eval_precisions_3": 9.5446,
106
+ "eval_precisions_4": 5.425,
107
+ "eval_ref_len": 21250,
108
+ "eval_rouge1": 0.3304,
109
+ "eval_rouge2": 0.1642,
110
+ "eval_rougeL": 0.3172,
111
+ "eval_rougeLsum": 0.3171,
112
+ "eval_runtime": 446.8682,
113
+ "eval_samples_per_second": 4.932,
114
+ "eval_steps_per_second": 1.233,
115
+ "eval_sys_len": 17211,
116
+ "eval_totals_1": 17211,
117
+ "eval_totals_2": 15007,
118
+ "eval_totals_3": 12803,
119
+ "eval_totals_4": 10599,
120
+ "step": 218
121
+ },
122
+ {
123
+ "epoch": 4.0,
124
+ "learning_rate": 0.0001,
125
+ "loss": 1.9208,
126
+ "step": 291
127
+ },
128
+ {
129
+ "epoch": 4.0,
130
+ "eval_bleu": 11.7136,
131
+ "eval_bp": 0.7714,
132
+ "eval_counts_1": 7599,
133
+ "eval_counts_2": 2755,
134
+ "eval_counts_3": 1296,
135
+ "eval_counts_4": 620,
136
+ "eval_exact_match": 0.015,
137
+ "eval_f1": 0.33,
138
+ "eval_gen_len": 12.3938,
139
+ "eval_loss": 1.4861969947814941,
140
+ "eval_precisions_1": 45.0418,
141
+ "eval_precisions_2": 18.7837,
142
+ "eval_precisions_3": 10.3988,
143
+ "eval_precisions_4": 6.0435,
144
+ "eval_ref_len": 21250,
145
+ "eval_rouge1": 0.3377,
146
+ "eval_rouge2": 0.1721,
147
+ "eval_rougeL": 0.3232,
148
+ "eval_rougeLsum": 0.3229,
149
+ "eval_runtime": 440.9926,
150
+ "eval_samples_per_second": 4.998,
151
+ "eval_steps_per_second": 1.249,
152
+ "eval_sys_len": 16871,
153
+ "eval_totals_1": 16871,
154
+ "eval_totals_2": 14667,
155
+ "eval_totals_3": 12463,
156
+ "eval_totals_4": 10259,
157
+ "step": 291
158
+ },
159
+ {
160
+ "epoch": 4.99,
161
+ "learning_rate": 0.0001,
162
+ "loss": 1.8135,
163
+ "step": 363
164
+ },
165
+ {
166
+ "epoch": 4.99,
167
+ "eval_bleu": 12.6402,
168
+ "eval_bp": 0.7893,
169
+ "eval_counts_1": 7831,
170
+ "eval_counts_2": 2955,
171
+ "eval_counts_3": 1424,
172
+ "eval_counts_4": 694,
173
+ "eval_exact_match": 0.0177,
174
+ "eval_f1": 0.3417,
175
+ "eval_gen_len": 12.6366,
176
+ "eval_loss": 1.4626398086547852,
177
+ "eval_precisions_1": 45.5715,
178
+ "eval_precisions_2": 19.7263,
179
+ "eval_precisions_3": 11.1459,
180
+ "eval_precisions_4": 6.5645,
181
+ "eval_ref_len": 21250,
182
+ "eval_rouge1": 0.3497,
183
+ "eval_rouge2": 0.1837,
184
+ "eval_rougeL": 0.3358,
185
+ "eval_rougeLsum": 0.3354,
186
+ "eval_runtime": 448.9344,
187
+ "eval_samples_per_second": 4.909,
188
+ "eval_steps_per_second": 1.227,
189
+ "eval_sys_len": 17184,
190
+ "eval_totals_1": 17184,
191
+ "eval_totals_2": 14980,
192
+ "eval_totals_3": 12776,
193
+ "eval_totals_4": 10572,
194
+ "step": 363
195
+ },
196
+ {
197
+ "epoch": 5.99,
198
+ "learning_rate": 0.0001,
199
+ "loss": 1.6907,
200
+ "step": 436
201
+ },
202
+ {
203
+ "epoch": 5.99,
204
+ "eval_bleu": 13.0722,
205
+ "eval_bp": 0.7735,
206
+ "eval_counts_1": 7872,
207
+ "eval_counts_2": 3023,
208
+ "eval_counts_3": 1482,
209
+ "eval_counts_4": 740,
210
+ "eval_exact_match": 0.0177,
211
+ "eval_f1": 0.3483,
212
+ "eval_gen_len": 12.564,
213
+ "eval_loss": 1.439197301864624,
214
+ "eval_precisions_1": 46.5606,
215
+ "eval_precisions_2": 20.5604,
216
+ "eval_precisions_3": 11.8569,
217
+ "eval_precisions_4": 7.188,
218
+ "eval_ref_len": 21250,
219
+ "eval_rouge1": 0.3566,
220
+ "eval_rouge2": 0.1896,
221
+ "eval_rougeL": 0.3432,
222
+ "eval_rougeLsum": 0.343,
223
+ "eval_runtime": 718.6776,
224
+ "eval_samples_per_second": 3.067,
225
+ "eval_steps_per_second": 0.767,
226
+ "eval_sys_len": 16907,
227
+ "eval_totals_1": 16907,
228
+ "eval_totals_2": 14703,
229
+ "eval_totals_3": 12499,
230
+ "eval_totals_4": 10295,
231
+ "step": 436
232
+ },
233
+ {
234
+ "epoch": 6.99,
235
+ "learning_rate": 0.0001,
236
+ "loss": 1.6159,
237
+ "step": 509
238
+ },
239
+ {
240
+ "epoch": 6.99,
241
+ "eval_bleu": 13.5053,
242
+ "eval_bp": 0.7797,
243
+ "eval_counts_1": 7981,
244
+ "eval_counts_2": 3128,
245
+ "eval_counts_3": 1542,
246
+ "eval_counts_4": 773,
247
+ "eval_exact_match": 0.0191,
248
+ "eval_f1": 0.3543,
249
+ "eval_gen_len": 12.5749,
250
+ "eval_loss": 1.4288065433502197,
251
+ "eval_precisions_1": 46.9029,
252
+ "eval_precisions_2": 21.118,
253
+ "eval_precisions_3": 12.2303,
254
+ "eval_precisions_4": 7.4298,
255
+ "eval_ref_len": 21250,
256
+ "eval_rouge1": 0.363,
257
+ "eval_rouge2": 0.1952,
258
+ "eval_rougeL": 0.3504,
259
+ "eval_rougeLsum": 0.3502,
260
+ "eval_runtime": 709.1881,
261
+ "eval_samples_per_second": 3.108,
262
+ "eval_steps_per_second": 0.777,
263
+ "eval_sys_len": 17016,
264
+ "eval_totals_1": 17016,
265
+ "eval_totals_2": 14812,
266
+ "eval_totals_3": 12608,
267
+ "eval_totals_4": 10404,
268
+ "step": 509
269
+ },
270
+ {
271
+ "epoch": 8.0,
272
+ "learning_rate": 0.0001,
273
+ "loss": 1.556,
274
+ "step": 582
275
+ },
276
+ {
277
+ "epoch": 8.0,
278
+ "eval_bleu": 13.2095,
279
+ "eval_bp": 0.797,
280
+ "eval_counts_1": 8014,
281
+ "eval_counts_2": 3046,
282
+ "eval_counts_3": 1496,
283
+ "eval_counts_4": 748,
284
+ "eval_exact_match": 0.0222,
285
+ "eval_f1": 0.355,
286
+ "eval_gen_len": 12.7641,
287
+ "eval_loss": 1.4131838083267212,
288
+ "eval_precisions_1": 46.2702,
289
+ "eval_precisions_2": 20.1508,
290
+ "eval_precisions_3": 11.5861,
291
+ "eval_precisions_4": 6.9854,
292
+ "eval_ref_len": 21250,
293
+ "eval_rouge1": 0.3632,
294
+ "eval_rouge2": 0.1903,
295
+ "eval_rougeL": 0.3489,
296
+ "eval_rougeLsum": 0.3491,
297
+ "eval_runtime": 736.4055,
298
+ "eval_samples_per_second": 2.993,
299
+ "eval_steps_per_second": 0.748,
300
+ "eval_sys_len": 17320,
301
+ "eval_totals_1": 17320,
302
+ "eval_totals_2": 15116,
303
+ "eval_totals_3": 12912,
304
+ "eval_totals_4": 10708,
305
+ "step": 582
306
+ },
307
+ {
308
+ "epoch": 9.0,
309
+ "learning_rate": 0.0001,
310
+ "loss": 1.4951,
311
+ "step": 655
312
+ },
313
+ {
314
+ "epoch": 9.0,
315
+ "eval_bleu": 14.1831,
316
+ "eval_bp": 0.789,
317
+ "eval_counts_1": 8342,
318
+ "eval_counts_2": 3271,
319
+ "eval_counts_3": 1622,
320
+ "eval_counts_4": 819,
321
+ "eval_exact_match": 0.0218,
322
+ "eval_f1": 0.3769,
323
+ "eval_gen_len": 12.7654,
324
+ "eval_loss": 1.3926042318344116,
325
+ "eval_precisions_1": 48.5621,
326
+ "eval_precisions_2": 21.8445,
327
+ "eval_precisions_3": 12.7016,
328
+ "eval_precisions_4": 7.7513,
329
+ "eval_ref_len": 21250,
330
+ "eval_rouge1": 0.3843,
331
+ "eval_rouge2": 0.2059,
332
+ "eval_rougeL": 0.3704,
333
+ "eval_rougeLsum": 0.3704,
334
+ "eval_runtime": 695.8554,
335
+ "eval_samples_per_second": 3.167,
336
+ "eval_steps_per_second": 0.792,
337
+ "eval_sys_len": 17178,
338
+ "eval_totals_1": 17178,
339
+ "eval_totals_2": 14974,
340
+ "eval_totals_3": 12770,
341
+ "eval_totals_4": 10566,
342
+ "step": 655
343
+ },
344
+ {
345
+ "epoch": 9.99,
346
+ "learning_rate": 0.0001,
347
+ "loss": 1.4522,
348
+ "step": 727
349
+ },
350
+ {
351
+ "epoch": 9.99,
352
+ "eval_bleu": 15.0442,
353
+ "eval_bp": 0.8187,
354
+ "eval_counts_1": 8639,
355
+ "eval_counts_2": 3449,
356
+ "eval_counts_3": 1740,
357
+ "eval_counts_4": 891,
358
+ "eval_exact_match": 0.024,
359
+ "eval_f1": 0.3895,
360
+ "eval_gen_len": 13.1016,
361
+ "eval_loss": 1.3769304752349854,
362
+ "eval_precisions_1": 48.7859,
363
+ "eval_precisions_2": 22.2459,
364
+ "eval_precisions_3": 13.0827,
365
+ "eval_precisions_4": 8.0299,
366
+ "eval_ref_len": 21250,
367
+ "eval_rouge1": 0.3972,
368
+ "eval_rouge2": 0.2129,
369
+ "eval_rougeL": 0.3821,
370
+ "eval_rougeLsum": 0.3823,
371
+ "eval_runtime": 733.5109,
372
+ "eval_samples_per_second": 3.005,
373
+ "eval_steps_per_second": 0.751,
374
+ "eval_sys_len": 17708,
375
+ "eval_totals_1": 17708,
376
+ "eval_totals_2": 15504,
377
+ "eval_totals_3": 13300,
378
+ "eval_totals_4": 11096,
379
+ "step": 727
380
+ },
381
+ {
382
+ "epoch": 10.99,
383
+ "learning_rate": 0.0001,
384
+ "loss": 1.3663,
385
+ "step": 800
386
+ },
387
+ {
388
+ "epoch": 10.99,
389
+ "eval_bleu": 15.2622,
390
+ "eval_bp": 0.8168,
391
+ "eval_counts_1": 8736,
392
+ "eval_counts_2": 3468,
393
+ "eval_counts_3": 1747,
394
+ "eval_counts_4": 924,
395
+ "eval_exact_match": 0.0245,
396
+ "eval_f1": 0.3946,
397
+ "eval_gen_len": 13.0399,
398
+ "eval_loss": 1.3676577806472778,
399
+ "eval_precisions_1": 49.4285,
400
+ "eval_precisions_2": 22.4176,
401
+ "eval_precisions_3": 13.169,
402
+ "eval_precisions_4": 8.3529,
403
+ "eval_ref_len": 21250,
404
+ "eval_rouge1": 0.4027,
405
+ "eval_rouge2": 0.215,
406
+ "eval_rougeL": 0.3871,
407
+ "eval_rougeLsum": 0.387,
408
+ "eval_runtime": 746.3261,
409
+ "eval_samples_per_second": 2.953,
410
+ "eval_steps_per_second": 0.738,
411
+ "eval_sys_len": 17674,
412
+ "eval_totals_1": 17674,
413
+ "eval_totals_2": 15470,
414
+ "eval_totals_3": 13266,
415
+ "eval_totals_4": 11062,
416
+ "step": 800
417
+ },
418
+ {
419
+ "epoch": 11.99,
420
+ "learning_rate": 0.0001,
421
+ "loss": 1.3122,
422
+ "step": 873
423
+ },
424
+ {
425
+ "epoch": 11.99,
426
+ "eval_bleu": 15.3943,
427
+ "eval_bp": 0.8308,
428
+ "eval_counts_1": 8833,
429
+ "eval_counts_2": 3533,
430
+ "eval_counts_3": 1780,
431
+ "eval_counts_4": 915,
432
+ "eval_exact_match": 0.0222,
433
+ "eval_f1": 0.3975,
434
+ "eval_gen_len": 13.3494,
435
+ "eval_loss": 1.352068305015564,
436
+ "eval_precisions_1": 49.272,
437
+ "eval_precisions_2": 22.4703,
438
+ "eval_precisions_3": 13.1667,
439
+ "eval_precisions_4": 8.0866,
440
+ "eval_ref_len": 21250,
441
+ "eval_rouge1": 0.4055,
442
+ "eval_rouge2": 0.219,
443
+ "eval_rougeL": 0.3915,
444
+ "eval_rougeLsum": 0.3915,
445
+ "eval_runtime": 815.025,
446
+ "eval_samples_per_second": 2.704,
447
+ "eval_steps_per_second": 0.676,
448
+ "eval_sys_len": 17927,
449
+ "eval_totals_1": 17927,
450
+ "eval_totals_2": 15723,
451
+ "eval_totals_3": 13519,
452
+ "eval_totals_4": 11315,
453
+ "step": 873
454
+ },
455
+ {
456
+ "epoch": 13.0,
457
+ "learning_rate": 0.0001,
458
+ "loss": 1.2641,
459
+ "step": 946
460
+ },
461
+ {
462
+ "epoch": 13.0,
463
+ "eval_bleu": 16.1011,
464
+ "eval_bp": 0.848,
465
+ "eval_counts_1": 9048,
466
+ "eval_counts_2": 3668,
467
+ "eval_counts_3": 1864,
468
+ "eval_counts_4": 989,
469
+ "eval_exact_match": 0.0268,
470
+ "eval_f1": 0.408,
471
+ "eval_gen_len": 13.5508,
472
+ "eval_loss": 1.3493599891662598,
473
+ "eval_precisions_1": 49.5998,
474
+ "eval_precisions_2": 22.8707,
475
+ "eval_precisions_3": 13.474,
476
+ "eval_precisions_4": 8.5039,
477
+ "eval_ref_len": 21250,
478
+ "eval_rouge1": 0.4165,
479
+ "eval_rouge2": 0.2265,
480
+ "eval_rougeL": 0.4011,
481
+ "eval_rougeLsum": 0.401,
482
+ "eval_runtime": 726.7867,
483
+ "eval_samples_per_second": 3.033,
484
+ "eval_steps_per_second": 0.758,
485
+ "eval_sys_len": 18242,
486
+ "eval_totals_1": 18242,
487
+ "eval_totals_2": 16038,
488
+ "eval_totals_3": 13834,
489
+ "eval_totals_4": 11630,
490
+ "step": 946
491
+ },
492
+ {
493
+ "epoch": 13.99,
494
+ "learning_rate": 0.0001,
495
+ "loss": 1.2359,
496
+ "step": 1018
497
+ },
498
+ {
499
+ "epoch": 13.99,
500
+ "eval_bleu": 16.3595,
501
+ "eval_bp": 0.8402,
502
+ "eval_counts_1": 9075,
503
+ "eval_counts_2": 3709,
504
+ "eval_counts_3": 1907,
505
+ "eval_counts_4": 1013,
506
+ "eval_exact_match": 0.0259,
507
+ "eval_f1": 0.4113,
508
+ "eval_gen_len": 13.5681,
509
+ "eval_loss": 1.3488041162490845,
510
+ "eval_precisions_1": 50.1437,
511
+ "eval_precisions_2": 23.3359,
512
+ "eval_precisions_3": 13.9299,
513
+ "eval_precisions_4": 8.8194,
514
+ "eval_ref_len": 21250,
515
+ "eval_rouge1": 0.4195,
516
+ "eval_rouge2": 0.2298,
517
+ "eval_rougeL": 0.4041,
518
+ "eval_rougeLsum": 0.4038,
519
+ "eval_runtime": 701.8557,
520
+ "eval_samples_per_second": 3.14,
521
+ "eval_steps_per_second": 0.785,
522
+ "eval_sys_len": 18098,
523
+ "eval_totals_1": 18098,
524
+ "eval_totals_2": 15894,
525
+ "eval_totals_3": 13690,
526
+ "eval_totals_4": 11486,
527
+ "step": 1018
528
+ },
529
+ {
530
+ "epoch": 14.99,
531
+ "learning_rate": 0.0001,
532
+ "loss": 1.1754,
533
+ "step": 1091
534
+ },
535
+ {
536
+ "epoch": 14.99,
537
+ "eval_bleu": 16.7083,
538
+ "eval_bp": 0.8547,
539
+ "eval_counts_1": 9182,
540
+ "eval_counts_2": 3777,
541
+ "eval_counts_3": 1957,
542
+ "eval_counts_4": 1048,
543
+ "eval_exact_match": 0.0268,
544
+ "eval_f1": 0.4145,
545
+ "eval_gen_len": 13.6534,
546
+ "eval_loss": 1.3482075929641724,
547
+ "eval_precisions_1": 49.9946,
548
+ "eval_precisions_2": 23.3696,
549
+ "eval_precisions_3": 14.0206,
550
+ "eval_precisions_4": 8.9161,
551
+ "eval_ref_len": 21250,
552
+ "eval_rouge1": 0.4227,
553
+ "eval_rouge2": 0.2314,
554
+ "eval_rougeL": 0.406,
555
+ "eval_rougeLsum": 0.4058,
556
+ "eval_runtime": 469.6435,
557
+ "eval_samples_per_second": 4.693,
558
+ "eval_steps_per_second": 1.173,
559
+ "eval_sys_len": 18366,
560
+ "eval_totals_1": 18366,
561
+ "eval_totals_2": 16162,
562
+ "eval_totals_3": 13958,
563
+ "eval_totals_4": 11754,
564
+ "step": 1091
565
+ },
566
+ {
567
+ "epoch": 15.99,
568
+ "learning_rate": 0.0001,
569
+ "loss": 1.1367,
570
+ "step": 1164
571
+ },
572
+ {
573
+ "epoch": 15.99,
574
+ "eval_bleu": 16.5803,
575
+ "eval_bp": 0.8517,
576
+ "eval_counts_1": 9164,
577
+ "eval_counts_2": 3761,
578
+ "eval_counts_3": 1935,
579
+ "eval_counts_4": 1033,
580
+ "eval_exact_match": 0.0245,
581
+ "eval_f1": 0.4147,
582
+ "eval_gen_len": 13.6152,
583
+ "eval_loss": 1.3501369953155518,
584
+ "eval_precisions_1": 50.0492,
585
+ "eval_precisions_2": 23.3515,
586
+ "eval_precisions_3": 13.9189,
587
+ "eval_precisions_4": 8.8306,
588
+ "eval_ref_len": 21250,
589
+ "eval_rouge1": 0.4225,
590
+ "eval_rouge2": 0.2316,
591
+ "eval_rougeL": 0.4078,
592
+ "eval_rougeLsum": 0.4079,
593
+ "eval_runtime": 480.2308,
594
+ "eval_samples_per_second": 4.589,
595
+ "eval_steps_per_second": 1.147,
596
+ "eval_sys_len": 18310,
597
+ "eval_totals_1": 18310,
598
+ "eval_totals_2": 16106,
599
+ "eval_totals_3": 13902,
600
+ "eval_totals_4": 11698,
601
+ "step": 1164
602
+ },
603
+ {
604
+ "epoch": 17.0,
605
+ "learning_rate": 0.0001,
606
+ "loss": 1.096,
607
+ "step": 1237
608
+ },
609
+ {
610
+ "epoch": 17.0,
611
+ "eval_bleu": 16.5513,
612
+ "eval_bp": 0.8499,
613
+ "eval_counts_1": 9126,
614
+ "eval_counts_2": 3712,
615
+ "eval_counts_3": 1922,
616
+ "eval_counts_4": 1050,
617
+ "eval_exact_match": 0.0295,
618
+ "eval_f1": 0.4141,
619
+ "eval_gen_len": 13.6325,
620
+ "eval_loss": 1.358604907989502,
621
+ "eval_precisions_1": 49.9316,
622
+ "eval_precisions_2": 23.0946,
623
+ "eval_precisions_3": 13.8582,
624
+ "eval_precisions_4": 9.0013,
625
+ "eval_ref_len": 21250,
626
+ "eval_rouge1": 0.4217,
627
+ "eval_rouge2": 0.2304,
628
+ "eval_rougeL": 0.4066,
629
+ "eval_rougeLsum": 0.4066,
630
+ "eval_runtime": 465.7019,
631
+ "eval_samples_per_second": 4.733,
632
+ "eval_steps_per_second": 1.183,
633
+ "eval_sys_len": 18277,
634
+ "eval_totals_1": 18277,
635
+ "eval_totals_2": 16073,
636
+ "eval_totals_3": 13869,
637
+ "eval_totals_4": 11665,
638
+ "step": 1237
639
+ },
640
+ {
641
+ "epoch": 18.0,
642
+ "learning_rate": 0.0001,
643
+ "loss": 1.0571,
644
+ "step": 1310
645
+ },
646
+ {
647
+ "epoch": 18.0,
648
+ "eval_bleu": 16.4708,
649
+ "eval_bp": 0.8446,
650
+ "eval_counts_1": 9087,
651
+ "eval_counts_2": 3707,
652
+ "eval_counts_3": 1923,
653
+ "eval_counts_4": 1033,
654
+ "eval_exact_match": 0.029,
655
+ "eval_f1": 0.4116,
656
+ "eval_gen_len": 13.5172,
657
+ "eval_loss": 1.3658462762832642,
658
+ "eval_precisions_1": 49.9862,
659
+ "eval_precisions_2": 23.205,
660
+ "eval_precisions_3": 13.9641,
661
+ "eval_precisions_4": 8.9306,
662
+ "eval_ref_len": 21250,
663
+ "eval_rouge1": 0.4196,
664
+ "eval_rouge2": 0.2301,
665
+ "eval_rougeL": 0.4049,
666
+ "eval_rougeLsum": 0.4049,
667
+ "eval_runtime": 463.8447,
668
+ "eval_samples_per_second": 4.752,
669
+ "eval_steps_per_second": 1.188,
670
+ "eval_sys_len": 18179,
671
+ "eval_totals_1": 18179,
672
+ "eval_totals_2": 15975,
673
+ "eval_totals_3": 13771,
674
+ "eval_totals_4": 11567,
675
+ "step": 1310
676
+ },
677
+ {
678
+ "epoch": 18.99,
679
+ "learning_rate": 0.0001,
680
+ "loss": 1.036,
681
+ "step": 1382
682
+ },
683
+ {
684
+ "epoch": 18.99,
685
+ "eval_bleu": 16.8386,
686
+ "eval_bp": 0.8528,
687
+ "eval_counts_1": 9206,
688
+ "eval_counts_2": 3806,
689
+ "eval_counts_3": 1976,
690
+ "eval_counts_4": 1059,
691
+ "eval_exact_match": 0.0309,
692
+ "eval_f1": 0.4174,
693
+ "eval_gen_len": 13.7205,
694
+ "eval_loss": 1.367233395576477,
695
+ "eval_precisions_1": 50.2182,
696
+ "eval_precisions_2": 23.5987,
697
+ "eval_precisions_3": 14.1913,
698
+ "eval_precisions_4": 9.0358,
699
+ "eval_ref_len": 21250,
700
+ "eval_rouge1": 0.4254,
701
+ "eval_rouge2": 0.2348,
702
+ "eval_rougeL": 0.4106,
703
+ "eval_rougeLsum": 0.4107,
704
+ "eval_runtime": 489.8628,
705
+ "eval_samples_per_second": 4.499,
706
+ "eval_steps_per_second": 1.125,
707
+ "eval_sys_len": 18332,
708
+ "eval_totals_1": 18332,
709
+ "eval_totals_2": 16128,
710
+ "eval_totals_3": 13924,
711
+ "eval_totals_4": 11720,
712
+ "step": 1382
713
+ },
714
+ {
715
+ "epoch": 19.79,
716
+ "learning_rate": 0.0001,
717
+ "loss": 0.9785,
718
+ "step": 1440
719
+ },
720
+ {
721
+ "epoch": 19.79,
722
+ "eval_bleu": 16.8234,
723
+ "eval_bp": 0.8438,
724
+ "eval_counts_1": 9180,
725
+ "eval_counts_2": 3796,
726
+ "eval_counts_3": 1973,
727
+ "eval_counts_4": 1059,
728
+ "eval_exact_match": 0.0327,
729
+ "eval_f1": 0.4172,
730
+ "eval_gen_len": 13.5113,
731
+ "eval_loss": 1.381914496421814,
732
+ "eval_precisions_1": 50.5395,
733
+ "eval_precisions_2": 23.7845,
734
+ "eval_precisions_3": 14.3428,
735
+ "eval_precisions_4": 9.1672,
736
+ "eval_ref_len": 21250,
737
+ "eval_rouge1": 0.4254,
738
+ "eval_rouge2": 0.2344,
739
+ "eval_rougeL": 0.4116,
740
+ "eval_rougeLsum": 0.4117,
741
+ "eval_runtime": 465.8344,
742
+ "eval_samples_per_second": 4.731,
743
+ "eval_steps_per_second": 1.183,
744
+ "eval_sys_len": 18164,
745
+ "eval_totals_1": 18164,
746
+ "eval_totals_2": 15960,
747
+ "eval_totals_3": 13756,
748
+ "eval_totals_4": 11552,
749
+ "step": 1440
750
+ },
751
+ {
752
+ "epoch": 19.79,
753
+ "step": 1440,
754
+ "total_flos": 4.419252384883016e+17,
755
+ "train_loss": 1.7299000342686972,
756
+ "train_runtime": 27815.7883,
757
+ "train_samples_per_second": 6.697,
758
+ "train_steps_per_second": 0.052
759
+ }
760
+ ],
761
+ "logging_steps": 500,
762
+ "max_steps": 1440,
763
+ "num_train_epochs": 20,
764
+ "save_steps": 500,
765
+ "total_flos": 4.419252384883016e+17,
766
+ "trial_name": null,
767
+ "trial_params": null
768
+ }
training_args.bin ADDED
Binary file (4.66 kB). View file