tyzhu commited on
Commit
c5f2609
1 Parent(s): a0c22e2

Training in progress, epoch 45, checkpoint

Browse files
checkpoint-15346/README.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+ ### Framework versions
7
+
8
+
9
+ - PEFT 0.5.0
checkpoint-15346/adapter_config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 16,
11
+ "lora_dropout": 0.05,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 16,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "q_proj",
18
+ "k_proj",
19
+ "v_proj",
20
+ "down_proj",
21
+ "gate_proj",
22
+ "up_proj"
23
+ ],
24
+ "task_type": "CAUSAL_LM"
25
+ }
checkpoint-15346/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83587de2188dd5001c88ae727aa83ee9cf91acfe90885e5027115680a94e0fbb
3
+ size 143269386
checkpoint-15346/added_tokens.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "</s>": 2,
3
+ "<s>": 1,
4
+ "<unk>": 0
5
+ }
checkpoint-15346/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38252c1c1541eeeeaad6bcfca0931e77fbba5f1e1f05b40b0d34d185b50daa8b
3
+ size 286590610
checkpoint-15346/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:086c7c8aa8496342ca20899777b695923e0f983d083df0684882b10f124bd69a
3
+ size 15024
checkpoint-15346/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:566861d89e6e426f6adcb047e79984989f0478fe65c32e11ede52f73e70de6ab
3
+ size 15024
checkpoint-15346/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff15396672d3bc8d129c5213a85be9b33bb0d81b5d55ba7532bfe5f2d3deb2bf
3
+ size 15024
checkpoint-15346/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e37a3f4aa8e781dfa704d37bf3a3545f1e05497c18ef7cb3f7472beb97dc34d5
3
+ size 15024
checkpoint-15346/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0e34ef345582038c9b3a8c4c035b773e063b779afcfcaeab100a62fa4006c3f
3
+ size 1064
checkpoint-15346/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "</s>",
5
+ "unk_token": "<unk>"
6
+ }
checkpoint-15346/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-15346/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
checkpoint-15346/tokenizer_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<unk>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "additional_special_tokens": [],
29
+ "bos_token": "<s>",
30
+ "clean_up_tokenization_spaces": false,
31
+ "eos_token": "</s>",
32
+ "legacy": false,
33
+ "model_max_length": 1000000000000000019884624838656,
34
+ "pad_token": "</s>",
35
+ "padding_side": "left",
36
+ "sp_model_kwargs": {},
37
+ "tokenizer_class": "LlamaTokenizer",
38
+ "unk_token": "<unk>",
39
+ "use_default_system_prompt": true
40
+ }
checkpoint-15346/trainer_state.json ADDED
@@ -0,0 +1,1657 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 45.0,
5
+ "eval_steps": 500,
6
+ "global_step": 15346,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.29,
13
+ "learning_rate": 3e-05,
14
+ "loss": 1.574,
15
+ "step": 100
16
+ },
17
+ {
18
+ "epoch": 0.59,
19
+ "learning_rate": 3e-05,
20
+ "loss": 1.474,
21
+ "step": 200
22
+ },
23
+ {
24
+ "epoch": 0.88,
25
+ "learning_rate": 3e-05,
26
+ "loss": 1.4104,
27
+ "step": 300
28
+ },
29
+ {
30
+ "epoch": 1.0,
31
+ "eval_accuracy": 0.4536923076923077,
32
+ "eval_loss": 3.3574578762054443,
33
+ "eval_runtime": 4.4299,
34
+ "eval_samples_per_second": 112.869,
35
+ "eval_steps_per_second": 14.221,
36
+ "step": 341
37
+ },
38
+ {
39
+ "epoch": 1.0,
40
+ "eval_bleu": 0.08840557310176213,
41
+ "eval_exact_match": 0.0,
42
+ "eval_prefix_exact_match": 0.058,
43
+ "step": 341
44
+ },
45
+ {
46
+ "epoch": 1.17,
47
+ "learning_rate": 3e-05,
48
+ "loss": 1.3889,
49
+ "step": 400
50
+ },
51
+ {
52
+ "epoch": 1.46,
53
+ "learning_rate": 3e-05,
54
+ "loss": 1.387,
55
+ "step": 500
56
+ },
57
+ {
58
+ "epoch": 1.76,
59
+ "learning_rate": 3e-05,
60
+ "loss": 1.389,
61
+ "step": 600
62
+ },
63
+ {
64
+ "epoch": 2.0,
65
+ "eval_accuracy": 0.4543589743589744,
66
+ "eval_loss": 3.4179794788360596,
67
+ "eval_runtime": 4.1997,
68
+ "eval_samples_per_second": 119.055,
69
+ "eval_steps_per_second": 15.001,
70
+ "step": 683
71
+ },
72
+ {
73
+ "epoch": 2.0,
74
+ "eval_bleu": 0.09820569565497561,
75
+ "eval_exact_match": 0.0,
76
+ "eval_prefix_exact_match": 0.056,
77
+ "step": 683
78
+ },
79
+ {
80
+ "epoch": 2.05,
81
+ "learning_rate": 3e-05,
82
+ "loss": 1.3725,
83
+ "step": 700
84
+ },
85
+ {
86
+ "epoch": 2.34,
87
+ "learning_rate": 3e-05,
88
+ "loss": 1.3588,
89
+ "step": 800
90
+ },
91
+ {
92
+ "epoch": 2.64,
93
+ "learning_rate": 3e-05,
94
+ "loss": 1.348,
95
+ "step": 900
96
+ },
97
+ {
98
+ "epoch": 2.93,
99
+ "learning_rate": 3e-05,
100
+ "loss": 1.3414,
101
+ "step": 1000
102
+ },
103
+ {
104
+ "epoch": 3.0,
105
+ "eval_accuracy": 0.45476923076923076,
106
+ "eval_loss": 3.511923313140869,
107
+ "eval_runtime": 4.1985,
108
+ "eval_samples_per_second": 119.089,
109
+ "eval_steps_per_second": 15.005,
110
+ "step": 1024
111
+ },
112
+ {
113
+ "epoch": 3.0,
114
+ "eval_bleu": 0.09889831624589397,
115
+ "eval_exact_match": 0.0,
116
+ "eval_prefix_exact_match": 0.056,
117
+ "step": 1024
118
+ },
119
+ {
120
+ "epoch": 3.22,
121
+ "learning_rate": 3e-05,
122
+ "loss": 1.3166,
123
+ "step": 1100
124
+ },
125
+ {
126
+ "epoch": 3.51,
127
+ "learning_rate": 3e-05,
128
+ "loss": 1.3035,
129
+ "step": 1200
130
+ },
131
+ {
132
+ "epoch": 3.81,
133
+ "learning_rate": 3e-05,
134
+ "loss": 1.3002,
135
+ "step": 1300
136
+ },
137
+ {
138
+ "epoch": 4.0,
139
+ "eval_accuracy": 0.45543589743589746,
140
+ "eval_loss": 3.5288071632385254,
141
+ "eval_runtime": 4.4066,
142
+ "eval_samples_per_second": 113.466,
143
+ "eval_steps_per_second": 14.297,
144
+ "step": 1366
145
+ },
146
+ {
147
+ "epoch": 4.0,
148
+ "eval_bleu": 0.11209345982903633,
149
+ "eval_exact_match": 0.0,
150
+ "eval_prefix_exact_match": 0.054,
151
+ "step": 1366
152
+ },
153
+ {
154
+ "epoch": 4.1,
155
+ "learning_rate": 3e-05,
156
+ "loss": 1.2929,
157
+ "step": 1400
158
+ },
159
+ {
160
+ "epoch": 4.39,
161
+ "learning_rate": 3e-05,
162
+ "loss": 1.2693,
163
+ "step": 1500
164
+ },
165
+ {
166
+ "epoch": 4.69,
167
+ "learning_rate": 3e-05,
168
+ "loss": 1.2697,
169
+ "step": 1600
170
+ },
171
+ {
172
+ "epoch": 4.98,
173
+ "learning_rate": 3e-05,
174
+ "loss": 1.2574,
175
+ "step": 1700
176
+ },
177
+ {
178
+ "epoch": 5.0,
179
+ "eval_accuracy": 0.45394871794871794,
180
+ "eval_loss": 3.6893365383148193,
181
+ "eval_runtime": 4.1122,
182
+ "eval_samples_per_second": 121.591,
183
+ "eval_steps_per_second": 15.32,
184
+ "step": 1707
185
+ },
186
+ {
187
+ "epoch": 5.0,
188
+ "eval_bleu": 0.12076940663377221,
189
+ "eval_exact_match": 0.0,
190
+ "eval_prefix_exact_match": 0.056,
191
+ "step": 1707
192
+ },
193
+ {
194
+ "epoch": 5.27,
195
+ "learning_rate": 3e-05,
196
+ "loss": 1.2097,
197
+ "step": 1800
198
+ },
199
+ {
200
+ "epoch": 5.56,
201
+ "learning_rate": 3e-05,
202
+ "loss": 1.2188,
203
+ "step": 1900
204
+ },
205
+ {
206
+ "epoch": 5.86,
207
+ "learning_rate": 3e-05,
208
+ "loss": 1.2258,
209
+ "step": 2000
210
+ },
211
+ {
212
+ "epoch": 6.0,
213
+ "eval_accuracy": 0.45615384615384613,
214
+ "eval_loss": 3.725893497467041,
215
+ "eval_runtime": 4.0989,
216
+ "eval_samples_per_second": 121.984,
217
+ "eval_steps_per_second": 15.37,
218
+ "step": 2049
219
+ },
220
+ {
221
+ "epoch": 6.0,
222
+ "eval_bleu": 0.11884272898137646,
223
+ "eval_exact_match": 0.0,
224
+ "eval_prefix_exact_match": 0.058,
225
+ "step": 2049
226
+ },
227
+ {
228
+ "epoch": 6.15,
229
+ "learning_rate": 3e-05,
230
+ "loss": 1.205,
231
+ "step": 2100
232
+ },
233
+ {
234
+ "epoch": 6.44,
235
+ "learning_rate": 3e-05,
236
+ "loss": 1.1643,
237
+ "step": 2200
238
+ },
239
+ {
240
+ "epoch": 6.73,
241
+ "learning_rate": 3e-05,
242
+ "loss": 1.1844,
243
+ "step": 2300
244
+ },
245
+ {
246
+ "epoch": 7.0,
247
+ "eval_accuracy": 0.45594871794871794,
248
+ "eval_loss": 3.724449396133423,
249
+ "eval_runtime": 4.8158,
250
+ "eval_samples_per_second": 103.825,
251
+ "eval_steps_per_second": 13.082,
252
+ "step": 2390
253
+ },
254
+ {
255
+ "epoch": 7.0,
256
+ "eval_bleu": 0.12407896336995003,
257
+ "eval_exact_match": 0.0,
258
+ "eval_prefix_exact_match": 0.048,
259
+ "step": 2390
260
+ },
261
+ {
262
+ "epoch": 7.03,
263
+ "learning_rate": 3e-05,
264
+ "loss": 1.1771,
265
+ "step": 2400
266
+ },
267
+ {
268
+ "epoch": 7.32,
269
+ "learning_rate": 3e-05,
270
+ "loss": 1.1218,
271
+ "step": 2500
272
+ },
273
+ {
274
+ "epoch": 7.61,
275
+ "learning_rate": 3e-05,
276
+ "loss": 1.1491,
277
+ "step": 2600
278
+ },
279
+ {
280
+ "epoch": 7.91,
281
+ "learning_rate": 3e-05,
282
+ "loss": 1.1363,
283
+ "step": 2700
284
+ },
285
+ {
286
+ "epoch": 8.0,
287
+ "eval_accuracy": 0.4543589743589744,
288
+ "eval_loss": 3.8138701915740967,
289
+ "eval_runtime": 4.1963,
290
+ "eval_samples_per_second": 119.152,
291
+ "eval_steps_per_second": 15.013,
292
+ "step": 2732
293
+ },
294
+ {
295
+ "epoch": 8.0,
296
+ "eval_bleu": 0.11947629327387163,
297
+ "eval_exact_match": 0.0,
298
+ "eval_prefix_exact_match": 0.046,
299
+ "step": 2732
300
+ },
301
+ {
302
+ "epoch": 8.2,
303
+ "learning_rate": 3e-05,
304
+ "loss": 1.0858,
305
+ "step": 2800
306
+ },
307
+ {
308
+ "epoch": 8.49,
309
+ "learning_rate": 3e-05,
310
+ "loss": 1.0951,
311
+ "step": 2900
312
+ },
313
+ {
314
+ "epoch": 8.78,
315
+ "learning_rate": 3e-05,
316
+ "loss": 1.0903,
317
+ "step": 3000
318
+ },
319
+ {
320
+ "epoch": 9.0,
321
+ "eval_accuracy": 0.4524102564102564,
322
+ "eval_loss": 3.9115548133850098,
323
+ "eval_runtime": 4.4011,
324
+ "eval_samples_per_second": 113.609,
325
+ "eval_steps_per_second": 14.315,
326
+ "step": 3073
327
+ },
328
+ {
329
+ "epoch": 9.0,
330
+ "eval_bleu": 0.13426062809240774,
331
+ "eval_exact_match": 0.0,
332
+ "eval_prefix_exact_match": 0.034,
333
+ "step": 3073
334
+ },
335
+ {
336
+ "epoch": 9.08,
337
+ "learning_rate": 3e-05,
338
+ "loss": 1.0822,
339
+ "step": 3100
340
+ },
341
+ {
342
+ "epoch": 9.37,
343
+ "learning_rate": 3e-05,
344
+ "loss": 1.0414,
345
+ "step": 3200
346
+ },
347
+ {
348
+ "epoch": 9.66,
349
+ "learning_rate": 3e-05,
350
+ "loss": 1.0563,
351
+ "step": 3300
352
+ },
353
+ {
354
+ "epoch": 9.96,
355
+ "learning_rate": 3e-05,
356
+ "loss": 1.0538,
357
+ "step": 3400
358
+ },
359
+ {
360
+ "epoch": 10.0,
361
+ "eval_accuracy": 0.4515897435897436,
362
+ "eval_loss": 3.92203426361084,
363
+ "eval_runtime": 4.4105,
364
+ "eval_samples_per_second": 113.366,
365
+ "eval_steps_per_second": 14.284,
366
+ "step": 3415
367
+ },
368
+ {
369
+ "epoch": 10.0,
370
+ "eval_bleu": 0.11780869016319889,
371
+ "eval_exact_match": 0.0,
372
+ "eval_prefix_exact_match": 0.032,
373
+ "step": 3415
374
+ },
375
+ {
376
+ "epoch": 10.25,
377
+ "learning_rate": 3e-05,
378
+ "loss": 0.991,
379
+ "step": 3500
380
+ },
381
+ {
382
+ "epoch": 10.54,
383
+ "learning_rate": 3e-05,
384
+ "loss": 1.0019,
385
+ "step": 3600
386
+ },
387
+ {
388
+ "epoch": 10.83,
389
+ "learning_rate": 3e-05,
390
+ "loss": 0.9971,
391
+ "step": 3700
392
+ },
393
+ {
394
+ "epoch": 11.0,
395
+ "eval_accuracy": 0.45143589743589746,
396
+ "eval_loss": 3.967252492904663,
397
+ "eval_runtime": 4.1003,
398
+ "eval_samples_per_second": 121.943,
399
+ "eval_steps_per_second": 15.365,
400
+ "step": 3756
401
+ },
402
+ {
403
+ "epoch": 11.0,
404
+ "eval_bleu": 0.12407049349871409,
405
+ "eval_exact_match": 0.0,
406
+ "eval_prefix_exact_match": 0.026,
407
+ "step": 3756
408
+ },
409
+ {
410
+ "epoch": 11.13,
411
+ "learning_rate": 3e-05,
412
+ "loss": 0.9861,
413
+ "step": 3800
414
+ },
415
+ {
416
+ "epoch": 11.42,
417
+ "learning_rate": 3e-05,
418
+ "loss": 0.9436,
419
+ "step": 3900
420
+ },
421
+ {
422
+ "epoch": 11.71,
423
+ "learning_rate": 3e-05,
424
+ "loss": 0.9699,
425
+ "step": 4000
426
+ },
427
+ {
428
+ "epoch": 12.0,
429
+ "eval_accuracy": 0.45076923076923076,
430
+ "eval_loss": 4.033609867095947,
431
+ "eval_runtime": 4.8103,
432
+ "eval_samples_per_second": 103.943,
433
+ "eval_steps_per_second": 13.097,
434
+ "step": 4098
435
+ },
436
+ {
437
+ "epoch": 12.0,
438
+ "eval_bleu": 0.12190601380876163,
439
+ "eval_exact_match": 0.0,
440
+ "eval_prefix_exact_match": 0.026,
441
+ "step": 4098
442
+ },
443
+ {
444
+ "epoch": 12.01,
445
+ "learning_rate": 3e-05,
446
+ "loss": 0.9783,
447
+ "step": 4100
448
+ },
449
+ {
450
+ "epoch": 12.3,
451
+ "learning_rate": 3e-05,
452
+ "loss": 0.9064,
453
+ "step": 4200
454
+ },
455
+ {
456
+ "epoch": 12.59,
457
+ "learning_rate": 3e-05,
458
+ "loss": 0.9178,
459
+ "step": 4300
460
+ },
461
+ {
462
+ "epoch": 12.88,
463
+ "learning_rate": 3e-05,
464
+ "loss": 0.9235,
465
+ "step": 4400
466
+ },
467
+ {
468
+ "epoch": 13.0,
469
+ "eval_accuracy": 0.4492820512820513,
470
+ "eval_loss": 4.002023696899414,
471
+ "eval_runtime": 4.8204,
472
+ "eval_samples_per_second": 103.726,
473
+ "eval_steps_per_second": 13.069,
474
+ "step": 4439
475
+ },
476
+ {
477
+ "epoch": 13.0,
478
+ "eval_bleu": 0.13115986163477472,
479
+ "eval_exact_match": 0.0,
480
+ "eval_prefix_exact_match": 0.026,
481
+ "step": 4439
482
+ },
483
+ {
484
+ "epoch": 13.18,
485
+ "learning_rate": 3e-05,
486
+ "loss": 0.8902,
487
+ "step": 4500
488
+ },
489
+ {
490
+ "epoch": 13.47,
491
+ "learning_rate": 3e-05,
492
+ "loss": 0.8703,
493
+ "step": 4600
494
+ },
495
+ {
496
+ "epoch": 13.76,
497
+ "learning_rate": 3e-05,
498
+ "loss": 0.891,
499
+ "step": 4700
500
+ },
501
+ {
502
+ "epoch": 14.0,
503
+ "eval_accuracy": 0.44774358974358974,
504
+ "eval_loss": 4.071566104888916,
505
+ "eval_runtime": 4.8316,
506
+ "eval_samples_per_second": 103.485,
507
+ "eval_steps_per_second": 13.039,
508
+ "step": 4781
509
+ },
510
+ {
511
+ "epoch": 14.0,
512
+ "eval_bleu": 0.12164300660650715,
513
+ "eval_exact_match": 0.0,
514
+ "eval_prefix_exact_match": 0.024,
515
+ "step": 4781
516
+ },
517
+ {
518
+ "epoch": 14.06,
519
+ "learning_rate": 3e-05,
520
+ "loss": 0.8698,
521
+ "step": 4800
522
+ },
523
+ {
524
+ "epoch": 14.35,
525
+ "learning_rate": 3e-05,
526
+ "loss": 0.8338,
527
+ "step": 4900
528
+ },
529
+ {
530
+ "epoch": 14.64,
531
+ "learning_rate": 3e-05,
532
+ "loss": 0.8362,
533
+ "step": 5000
534
+ },
535
+ {
536
+ "epoch": 14.93,
537
+ "learning_rate": 3e-05,
538
+ "loss": 0.845,
539
+ "step": 5100
540
+ },
541
+ {
542
+ "epoch": 15.0,
543
+ "eval_accuracy": 0.44774358974358974,
544
+ "eval_loss": 4.09920597076416,
545
+ "eval_runtime": 4.4174,
546
+ "eval_samples_per_second": 113.19,
547
+ "eval_steps_per_second": 14.262,
548
+ "step": 5122
549
+ },
550
+ {
551
+ "epoch": 15.0,
552
+ "eval_bleu": 0.1224977607294519,
553
+ "eval_exact_match": 0.0,
554
+ "eval_prefix_exact_match": 0.02,
555
+ "step": 5122
556
+ },
557
+ {
558
+ "epoch": 15.23,
559
+ "learning_rate": 3e-05,
560
+ "loss": 0.7981,
561
+ "step": 5200
562
+ },
563
+ {
564
+ "epoch": 15.52,
565
+ "learning_rate": 3e-05,
566
+ "loss": 0.7978,
567
+ "step": 5300
568
+ },
569
+ {
570
+ "epoch": 15.81,
571
+ "learning_rate": 3e-05,
572
+ "loss": 0.8009,
573
+ "step": 5400
574
+ },
575
+ {
576
+ "epoch": 16.0,
577
+ "eval_accuracy": 0.4464102564102564,
578
+ "eval_loss": 4.093270301818848,
579
+ "eval_runtime": 4.102,
580
+ "eval_samples_per_second": 121.892,
581
+ "eval_steps_per_second": 15.358,
582
+ "step": 5464
583
+ },
584
+ {
585
+ "epoch": 16.0,
586
+ "eval_bleu": 0.12201237729153987,
587
+ "eval_exact_match": 0.0,
588
+ "eval_prefix_exact_match": 0.022,
589
+ "step": 5464
590
+ },
591
+ {
592
+ "epoch": 16.11,
593
+ "learning_rate": 3e-05,
594
+ "loss": 0.7948,
595
+ "step": 5500
596
+ },
597
+ {
598
+ "epoch": 16.4,
599
+ "learning_rate": 3e-05,
600
+ "loss": 0.746,
601
+ "step": 5600
602
+ },
603
+ {
604
+ "epoch": 16.69,
605
+ "learning_rate": 3e-05,
606
+ "loss": 0.7627,
607
+ "step": 5700
608
+ },
609
+ {
610
+ "epoch": 16.98,
611
+ "learning_rate": 3e-05,
612
+ "loss": 0.782,
613
+ "step": 5800
614
+ },
615
+ {
616
+ "epoch": 17.0,
617
+ "eval_accuracy": 0.44671794871794873,
618
+ "eval_loss": 4.1283488273620605,
619
+ "eval_runtime": 4.84,
620
+ "eval_samples_per_second": 103.305,
621
+ "eval_steps_per_second": 13.016,
622
+ "step": 5805
623
+ },
624
+ {
625
+ "epoch": 17.0,
626
+ "eval_bleu": 0.1263852710208009,
627
+ "eval_exact_match": 0.0,
628
+ "eval_prefix_exact_match": 0.02,
629
+ "step": 5805
630
+ },
631
+ {
632
+ "epoch": 17.28,
633
+ "learning_rate": 3e-05,
634
+ "loss": 0.7142,
635
+ "step": 5900
636
+ },
637
+ {
638
+ "epoch": 17.57,
639
+ "learning_rate": 3e-05,
640
+ "loss": 0.7253,
641
+ "step": 6000
642
+ },
643
+ {
644
+ "epoch": 17.86,
645
+ "learning_rate": 3e-05,
646
+ "loss": 0.7294,
647
+ "step": 6100
648
+ },
649
+ {
650
+ "epoch": 18.0,
651
+ "eval_accuracy": 0.44564102564102565,
652
+ "eval_loss": 4.164257049560547,
653
+ "eval_runtime": 4.1037,
654
+ "eval_samples_per_second": 121.84,
655
+ "eval_steps_per_second": 15.352,
656
+ "step": 6147
657
+ },
658
+ {
659
+ "epoch": 18.0,
660
+ "eval_bleu": 0.10715019125675011,
661
+ "eval_exact_match": 0.0,
662
+ "eval_prefix_exact_match": 0.022,
663
+ "step": 6147
664
+ },
665
+ {
666
+ "epoch": 18.16,
667
+ "learning_rate": 3e-05,
668
+ "loss": 0.7055,
669
+ "step": 6200
670
+ },
671
+ {
672
+ "epoch": 18.45,
673
+ "learning_rate": 3e-05,
674
+ "loss": 0.672,
675
+ "step": 6300
676
+ },
677
+ {
678
+ "epoch": 18.74,
679
+ "learning_rate": 3e-05,
680
+ "loss": 0.6792,
681
+ "step": 6400
682
+ },
683
+ {
684
+ "epoch": 19.0,
685
+ "eval_accuracy": 0.44487179487179485,
686
+ "eval_loss": 4.185911655426025,
687
+ "eval_runtime": 4.823,
688
+ "eval_samples_per_second": 103.67,
689
+ "eval_steps_per_second": 13.062,
690
+ "step": 6488
691
+ },
692
+ {
693
+ "epoch": 19.0,
694
+ "eval_bleu": 0.11314440439431668,
695
+ "eval_exact_match": 0.0,
696
+ "eval_prefix_exact_match": 0.02,
697
+ "step": 6488
698
+ },
699
+ {
700
+ "epoch": 19.03,
701
+ "learning_rate": 3e-05,
702
+ "loss": 0.7053,
703
+ "step": 6500
704
+ },
705
+ {
706
+ "epoch": 19.33,
707
+ "learning_rate": 3e-05,
708
+ "loss": 0.6448,
709
+ "step": 6600
710
+ },
711
+ {
712
+ "epoch": 19.62,
713
+ "learning_rate": 3e-05,
714
+ "loss": 0.6448,
715
+ "step": 6700
716
+ },
717
+ {
718
+ "epoch": 19.91,
719
+ "learning_rate": 3e-05,
720
+ "loss": 0.6672,
721
+ "step": 6800
722
+ },
723
+ {
724
+ "epoch": 20.0,
725
+ "eval_accuracy": 0.4436923076923077,
726
+ "eval_loss": 4.201004981994629,
727
+ "eval_runtime": 4.1985,
728
+ "eval_samples_per_second": 119.09,
729
+ "eval_steps_per_second": 15.005,
730
+ "step": 6830
731
+ },
732
+ {
733
+ "epoch": 20.0,
734
+ "eval_bleu": 0.11913021835274795,
735
+ "eval_exact_match": 0.0,
736
+ "eval_prefix_exact_match": 0.02,
737
+ "step": 6830
738
+ },
739
+ {
740
+ "epoch": 20.2,
741
+ "learning_rate": 3e-05,
742
+ "loss": 0.6146,
743
+ "step": 6900
744
+ },
745
+ {
746
+ "epoch": 20.5,
747
+ "learning_rate": 3e-05,
748
+ "loss": 0.6102,
749
+ "step": 7000
750
+ },
751
+ {
752
+ "epoch": 20.79,
753
+ "learning_rate": 3e-05,
754
+ "loss": 0.6258,
755
+ "step": 7100
756
+ },
757
+ {
758
+ "epoch": 21.0,
759
+ "eval_accuracy": 0.4428717948717949,
760
+ "eval_loss": 4.230019569396973,
761
+ "eval_runtime": 4.4124,
762
+ "eval_samples_per_second": 113.318,
763
+ "eval_steps_per_second": 14.278,
764
+ "step": 7171
765
+ },
766
+ {
767
+ "epoch": 21.0,
768
+ "eval_bleu": 0.12133891034082767,
769
+ "eval_exact_match": 0.0,
770
+ "eval_prefix_exact_match": 0.022,
771
+ "step": 7171
772
+ },
773
+ {
774
+ "epoch": 21.08,
775
+ "learning_rate": 3e-05,
776
+ "loss": 0.6149,
777
+ "step": 7200
778
+ },
779
+ {
780
+ "epoch": 21.38,
781
+ "learning_rate": 3e-05,
782
+ "loss": 0.5669,
783
+ "step": 7300
784
+ },
785
+ {
786
+ "epoch": 21.67,
787
+ "learning_rate": 3e-05,
788
+ "loss": 0.5923,
789
+ "step": 7400
790
+ },
791
+ {
792
+ "epoch": 21.96,
793
+ "learning_rate": 3e-05,
794
+ "loss": 0.599,
795
+ "step": 7500
796
+ },
797
+ {
798
+ "epoch": 22.0,
799
+ "eval_accuracy": 0.4418974358974359,
800
+ "eval_loss": 4.253176689147949,
801
+ "eval_runtime": 4.4121,
802
+ "eval_samples_per_second": 113.326,
803
+ "eval_steps_per_second": 14.279,
804
+ "step": 7513
805
+ },
806
+ {
807
+ "epoch": 22.0,
808
+ "eval_bleu": 0.1179919389999147,
809
+ "eval_exact_match": 0.0,
810
+ "eval_prefix_exact_match": 0.02,
811
+ "step": 7513
812
+ },
813
+ {
814
+ "epoch": 22.25,
815
+ "learning_rate": 3e-05,
816
+ "loss": 0.536,
817
+ "step": 7600
818
+ },
819
+ {
820
+ "epoch": 22.55,
821
+ "learning_rate": 3e-05,
822
+ "loss": 0.5479,
823
+ "step": 7700
824
+ },
825
+ {
826
+ "epoch": 22.84,
827
+ "learning_rate": 3e-05,
828
+ "loss": 0.5625,
829
+ "step": 7800
830
+ },
831
+ {
832
+ "epoch": 23.0,
833
+ "eval_accuracy": 0.443025641025641,
834
+ "eval_loss": 4.293737411499023,
835
+ "eval_runtime": 4.4185,
836
+ "eval_samples_per_second": 113.162,
837
+ "eval_steps_per_second": 14.258,
838
+ "step": 7854
839
+ },
840
+ {
841
+ "epoch": 23.0,
842
+ "eval_bleu": 0.10699095482247514,
843
+ "eval_exact_match": 0.0,
844
+ "eval_prefix_exact_match": 0.022,
845
+ "step": 7854
846
+ },
847
+ {
848
+ "epoch": 23.13,
849
+ "learning_rate": 3e-05,
850
+ "loss": 0.5409,
851
+ "step": 7900
852
+ },
853
+ {
854
+ "epoch": 23.43,
855
+ "learning_rate": 3e-05,
856
+ "loss": 0.5067,
857
+ "step": 8000
858
+ },
859
+ {
860
+ "epoch": 23.72,
861
+ "learning_rate": 3e-05,
862
+ "loss": 0.5267,
863
+ "step": 8100
864
+ },
865
+ {
866
+ "epoch": 24.0,
867
+ "eval_accuracy": 0.44153846153846155,
868
+ "eval_loss": 4.254815578460693,
869
+ "eval_runtime": 4.2068,
870
+ "eval_samples_per_second": 118.854,
871
+ "eval_steps_per_second": 14.976,
872
+ "step": 8196
873
+ },
874
+ {
875
+ "epoch": 24.0,
876
+ "eval_bleu": 0.10975165733539657,
877
+ "eval_exact_match": 0.0,
878
+ "eval_prefix_exact_match": 0.024,
879
+ "step": 8196
880
+ },
881
+ {
882
+ "epoch": 24.01,
883
+ "learning_rate": 3e-05,
884
+ "loss": 0.5363,
885
+ "step": 8200
886
+ },
887
+ {
888
+ "epoch": 24.3,
889
+ "learning_rate": 3e-05,
890
+ "loss": 0.4806,
891
+ "step": 8300
892
+ },
893
+ {
894
+ "epoch": 24.6,
895
+ "learning_rate": 3e-05,
896
+ "loss": 0.49,
897
+ "step": 8400
898
+ },
899
+ {
900
+ "epoch": 24.89,
901
+ "learning_rate": 3e-05,
902
+ "loss": 0.5004,
903
+ "step": 8500
904
+ },
905
+ {
906
+ "epoch": 25.0,
907
+ "eval_accuracy": 0.44035897435897436,
908
+ "eval_loss": 4.332499027252197,
909
+ "eval_runtime": 4.4052,
910
+ "eval_samples_per_second": 113.503,
911
+ "eval_steps_per_second": 14.301,
912
+ "step": 8537
913
+ },
914
+ {
915
+ "epoch": 25.0,
916
+ "eval_bleu": 0.10401238605841524,
917
+ "eval_exact_match": 0.0,
918
+ "eval_prefix_exact_match": 0.024,
919
+ "step": 8537
920
+ },
921
+ {
922
+ "epoch": 25.18,
923
+ "learning_rate": 3e-05,
924
+ "loss": 0.4746,
925
+ "step": 8600
926
+ },
927
+ {
928
+ "epoch": 25.48,
929
+ "learning_rate": 3e-05,
930
+ "loss": 0.4596,
931
+ "step": 8700
932
+ },
933
+ {
934
+ "epoch": 25.77,
935
+ "learning_rate": 3e-05,
936
+ "loss": 0.4681,
937
+ "step": 8800
938
+ },
939
+ {
940
+ "epoch": 26.0,
941
+ "eval_accuracy": 0.43964102564102564,
942
+ "eval_loss": 4.3162150382995605,
943
+ "eval_runtime": 4.5151,
944
+ "eval_samples_per_second": 110.739,
945
+ "eval_steps_per_second": 13.953,
946
+ "step": 8879
947
+ },
948
+ {
949
+ "epoch": 26.0,
950
+ "eval_bleu": 0.10710979645106765,
951
+ "eval_exact_match": 0.0,
952
+ "eval_prefix_exact_match": 0.026,
953
+ "step": 8879
954
+ },
955
+ {
956
+ "epoch": 26.06,
957
+ "learning_rate": 3e-05,
958
+ "loss": 0.459,
959
+ "step": 8900
960
+ },
961
+ {
962
+ "epoch": 26.35,
963
+ "learning_rate": 3e-05,
964
+ "loss": 0.4242,
965
+ "step": 9000
966
+ },
967
+ {
968
+ "epoch": 26.65,
969
+ "learning_rate": 3e-05,
970
+ "loss": 0.4445,
971
+ "step": 9100
972
+ },
973
+ {
974
+ "epoch": 26.94,
975
+ "learning_rate": 3e-05,
976
+ "loss": 0.4453,
977
+ "step": 9200
978
+ },
979
+ {
980
+ "epoch": 27.0,
981
+ "eval_accuracy": 0.4388205128205128,
982
+ "eval_loss": 4.377138137817383,
983
+ "eval_runtime": 4.8206,
984
+ "eval_samples_per_second": 103.722,
985
+ "eval_steps_per_second": 13.069,
986
+ "step": 9220
987
+ },
988
+ {
989
+ "epoch": 27.0,
990
+ "eval_bleu": 0.10010720174849254,
991
+ "eval_exact_match": 0.0,
992
+ "eval_prefix_exact_match": 0.026,
993
+ "step": 9220
994
+ },
995
+ {
996
+ "epoch": 27.23,
997
+ "learning_rate": 3e-05,
998
+ "loss": 0.4066,
999
+ "step": 9300
1000
+ },
1001
+ {
1002
+ "epoch": 27.53,
1003
+ "learning_rate": 3e-05,
1004
+ "loss": 0.4153,
1005
+ "step": 9400
1006
+ },
1007
+ {
1008
+ "epoch": 27.82,
1009
+ "learning_rate": 3e-05,
1010
+ "loss": 0.4161,
1011
+ "step": 9500
1012
+ },
1013
+ {
1014
+ "epoch": 28.0,
1015
+ "eval_accuracy": 0.43861538461538463,
1016
+ "eval_loss": 4.405981063842773,
1017
+ "eval_runtime": 4.4101,
1018
+ "eval_samples_per_second": 113.376,
1019
+ "eval_steps_per_second": 14.285,
1020
+ "step": 9562
1021
+ },
1022
+ {
1023
+ "epoch": 28.0,
1024
+ "eval_bleu": 0.10423428286785764,
1025
+ "eval_exact_match": 0.0,
1026
+ "eval_prefix_exact_match": 0.026,
1027
+ "step": 9562
1028
+ },
1029
+ {
1030
+ "epoch": 28.11,
1031
+ "learning_rate": 3e-05,
1032
+ "loss": 0.398,
1033
+ "step": 9600
1034
+ },
1035
+ {
1036
+ "epoch": 28.4,
1037
+ "learning_rate": 3e-05,
1038
+ "loss": 0.3788,
1039
+ "step": 9700
1040
+ },
1041
+ {
1042
+ "epoch": 28.7,
1043
+ "learning_rate": 3e-05,
1044
+ "loss": 0.3943,
1045
+ "step": 9800
1046
+ },
1047
+ {
1048
+ "epoch": 28.99,
1049
+ "learning_rate": 3e-05,
1050
+ "loss": 0.3994,
1051
+ "step": 9900
1052
+ },
1053
+ {
1054
+ "epoch": 29.0,
1055
+ "eval_accuracy": 0.4376923076923077,
1056
+ "eval_loss": 4.468777656555176,
1057
+ "eval_runtime": 4.2025,
1058
+ "eval_samples_per_second": 118.976,
1059
+ "eval_steps_per_second": 14.991,
1060
+ "step": 9903
1061
+ },
1062
+ {
1063
+ "epoch": 29.0,
1064
+ "eval_bleu": 0.08529933417800747,
1065
+ "eval_exact_match": 0.0,
1066
+ "eval_prefix_exact_match": 0.02,
1067
+ "step": 9903
1068
+ },
1069
+ {
1070
+ "epoch": 29.28,
1071
+ "learning_rate": 3e-05,
1072
+ "loss": 0.3519,
1073
+ "step": 10000
1074
+ },
1075
+ {
1076
+ "epoch": 29.58,
1077
+ "learning_rate": 3e-05,
1078
+ "loss": 0.3643,
1079
+ "step": 10100
1080
+ },
1081
+ {
1082
+ "epoch": 29.87,
1083
+ "learning_rate": 3e-05,
1084
+ "loss": 0.3695,
1085
+ "step": 10200
1086
+ },
1087
+ {
1088
+ "epoch": 30.0,
1089
+ "eval_accuracy": 0.4376923076923077,
1090
+ "eval_loss": 4.464532375335693,
1091
+ "eval_runtime": 4.164,
1092
+ "eval_samples_per_second": 120.078,
1093
+ "eval_steps_per_second": 15.13,
1094
+ "step": 10245
1095
+ },
1096
+ {
1097
+ "epoch": 30.0,
1098
+ "eval_bleu": 0.08562517234751162,
1099
+ "eval_exact_match": 0.0,
1100
+ "eval_prefix_exact_match": 0.026,
1101
+ "step": 10245
1102
+ },
1103
+ {
1104
+ "epoch": 30.16,
1105
+ "learning_rate": 3e-05,
1106
+ "loss": 0.3474,
1107
+ "step": 10300
1108
+ },
1109
+ {
1110
+ "epoch": 30.45,
1111
+ "learning_rate": 3e-05,
1112
+ "loss": 0.3327,
1113
+ "step": 10400
1114
+ },
1115
+ {
1116
+ "epoch": 30.75,
1117
+ "learning_rate": 3e-05,
1118
+ "loss": 0.3505,
1119
+ "step": 10500
1120
+ },
1121
+ {
1122
+ "epoch": 31.0,
1123
+ "eval_accuracy": 0.4377948717948718,
1124
+ "eval_loss": 4.462403297424316,
1125
+ "eval_runtime": 4.2457,
1126
+ "eval_samples_per_second": 117.767,
1127
+ "eval_steps_per_second": 14.839,
1128
+ "step": 10586
1129
+ },
1130
+ {
1131
+ "epoch": 31.0,
1132
+ "eval_bleu": 0.09623161231398927,
1133
+ "eval_exact_match": 0.0,
1134
+ "eval_prefix_exact_match": 0.022,
1135
+ "step": 10586
1136
+ },
1137
+ {
1138
+ "epoch": 31.04,
1139
+ "learning_rate": 3e-05,
1140
+ "loss": 0.3476,
1141
+ "step": 10600
1142
+ },
1143
+ {
1144
+ "epoch": 31.33,
1145
+ "learning_rate": 3e-05,
1146
+ "loss": 0.3057,
1147
+ "step": 10700
1148
+ },
1149
+ {
1150
+ "epoch": 31.63,
1151
+ "learning_rate": 3e-05,
1152
+ "loss": 0.3224,
1153
+ "step": 10800
1154
+ },
1155
+ {
1156
+ "epoch": 31.92,
1157
+ "learning_rate": 3e-05,
1158
+ "loss": 0.3342,
1159
+ "step": 10900
1160
+ },
1161
+ {
1162
+ "epoch": 32.0,
1163
+ "eval_accuracy": 0.43646153846153846,
1164
+ "eval_loss": 4.46300745010376,
1165
+ "eval_runtime": 4.5326,
1166
+ "eval_samples_per_second": 110.312,
1167
+ "eval_steps_per_second": 13.899,
1168
+ "step": 10928
1169
+ },
1170
+ {
1171
+ "epoch": 32.0,
1172
+ "eval_bleu": 0.09571435454261526,
1173
+ "eval_exact_match": 0.0,
1174
+ "eval_prefix_exact_match": 0.02,
1175
+ "step": 10928
1176
+ },
1177
+ {
1178
+ "epoch": 32.21,
1179
+ "learning_rate": 3e-05,
1180
+ "loss": 0.2943,
1181
+ "step": 11000
1182
+ },
1183
+ {
1184
+ "epoch": 32.5,
1185
+ "learning_rate": 3e-05,
1186
+ "loss": 0.3021,
1187
+ "step": 11100
1188
+ },
1189
+ {
1190
+ "epoch": 32.8,
1191
+ "learning_rate": 3e-05,
1192
+ "loss": 0.3075,
1193
+ "step": 11200
1194
+ },
1195
+ {
1196
+ "epoch": 33.0,
1197
+ "eval_accuracy": 0.4342051282051282,
1198
+ "eval_loss": 4.544379711151123,
1199
+ "eval_runtime": 4.4078,
1200
+ "eval_samples_per_second": 113.435,
1201
+ "eval_steps_per_second": 14.293,
1202
+ "step": 11269
1203
+ },
1204
+ {
1205
+ "epoch": 33.0,
1206
+ "eval_bleu": 0.11219607728661593,
1207
+ "eval_exact_match": 0.0,
1208
+ "eval_prefix_exact_match": 0.016,
1209
+ "step": 11269
1210
+ },
1211
+ {
1212
+ "epoch": 33.09,
1213
+ "learning_rate": 3e-05,
1214
+ "loss": 0.3047,
1215
+ "step": 11300
1216
+ },
1217
+ {
1218
+ "epoch": 33.38,
1219
+ "learning_rate": 3e-05,
1220
+ "loss": 0.2764,
1221
+ "step": 11400
1222
+ },
1223
+ {
1224
+ "epoch": 33.67,
1225
+ "learning_rate": 3e-05,
1226
+ "loss": 0.2833,
1227
+ "step": 11500
1228
+ },
1229
+ {
1230
+ "epoch": 33.97,
1231
+ "learning_rate": 3e-05,
1232
+ "loss": 0.2949,
1233
+ "step": 11600
1234
+ },
1235
+ {
1236
+ "epoch": 34.0,
1237
+ "eval_accuracy": 0.43441025641025643,
1238
+ "eval_loss": 4.548118591308594,
1239
+ "eval_runtime": 4.4066,
1240
+ "eval_samples_per_second": 113.466,
1241
+ "eval_steps_per_second": 14.297,
1242
+ "step": 11611
1243
+ },
1244
+ {
1245
+ "epoch": 34.0,
1246
+ "eval_bleu": 0.10059053111753888,
1247
+ "eval_exact_match": 0.0,
1248
+ "eval_prefix_exact_match": 0.018,
1249
+ "step": 11611
1250
+ },
1251
+ {
1252
+ "epoch": 34.26,
1253
+ "learning_rate": 3e-05,
1254
+ "loss": 0.2604,
1255
+ "step": 11700
1256
+ },
1257
+ {
1258
+ "epoch": 34.55,
1259
+ "learning_rate": 3e-05,
1260
+ "loss": 0.2657,
1261
+ "step": 11800
1262
+ },
1263
+ {
1264
+ "epoch": 34.85,
1265
+ "learning_rate": 3e-05,
1266
+ "loss": 0.2705,
1267
+ "step": 11900
1268
+ },
1269
+ {
1270
+ "epoch": 35.0,
1271
+ "eval_accuracy": 0.4356923076923077,
1272
+ "eval_loss": 4.5613813400268555,
1273
+ "eval_runtime": 4.1998,
1274
+ "eval_samples_per_second": 119.055,
1275
+ "eval_steps_per_second": 15.001,
1276
+ "step": 11952
1277
+ },
1278
+ {
1279
+ "epoch": 35.0,
1280
+ "eval_bleu": 0.10287091260502411,
1281
+ "eval_exact_match": 0.0,
1282
+ "eval_prefix_exact_match": 0.012,
1283
+ "step": 11952
1284
+ },
1285
+ {
1286
+ "epoch": 35.14,
1287
+ "learning_rate": 3e-05,
1288
+ "loss": 0.2625,
1289
+ "step": 12000
1290
+ },
1291
+ {
1292
+ "epoch": 35.43,
1293
+ "learning_rate": 3e-05,
1294
+ "loss": 0.248,
1295
+ "step": 12100
1296
+ },
1297
+ {
1298
+ "epoch": 35.72,
1299
+ "learning_rate": 3e-05,
1300
+ "loss": 0.2554,
1301
+ "step": 12200
1302
+ },
1303
+ {
1304
+ "epoch": 36.0,
1305
+ "eval_accuracy": 0.4339487179487179,
1306
+ "eval_loss": 4.591047763824463,
1307
+ "eval_runtime": 4.1092,
1308
+ "eval_samples_per_second": 121.677,
1309
+ "eval_steps_per_second": 15.331,
1310
+ "step": 12294
1311
+ },
1312
+ {
1313
+ "epoch": 36.0,
1314
+ "eval_bleu": 0.07688483668776659,
1315
+ "eval_exact_match": 0.0,
1316
+ "eval_prefix_exact_match": 0.02,
1317
+ "step": 12294
1318
+ },
1319
+ {
1320
+ "epoch": 36.02,
1321
+ "learning_rate": 3e-05,
1322
+ "loss": 0.2572,
1323
+ "step": 12300
1324
+ },
1325
+ {
1326
+ "epoch": 36.31,
1327
+ "learning_rate": 3e-05,
1328
+ "loss": 0.2276,
1329
+ "step": 12400
1330
+ },
1331
+ {
1332
+ "epoch": 36.6,
1333
+ "learning_rate": 3e-05,
1334
+ "loss": 0.236,
1335
+ "step": 12500
1336
+ },
1337
+ {
1338
+ "epoch": 36.9,
1339
+ "learning_rate": 3e-05,
1340
+ "loss": 0.2428,
1341
+ "step": 12600
1342
+ },
1343
+ {
1344
+ "epoch": 37.0,
1345
+ "eval_accuracy": 0.43323076923076925,
1346
+ "eval_loss": 4.645793914794922,
1347
+ "eval_runtime": 4.4058,
1348
+ "eval_samples_per_second": 113.486,
1349
+ "eval_steps_per_second": 14.299,
1350
+ "step": 12635
1351
+ },
1352
+ {
1353
+ "epoch": 37.0,
1354
+ "eval_bleu": 0.09755678571463763,
1355
+ "eval_exact_match": 0.0,
1356
+ "eval_prefix_exact_match": 0.01,
1357
+ "step": 12635
1358
+ },
1359
+ {
1360
+ "epoch": 37.19,
1361
+ "learning_rate": 3e-05,
1362
+ "loss": 0.2263,
1363
+ "step": 12700
1364
+ },
1365
+ {
1366
+ "epoch": 37.48,
1367
+ "learning_rate": 3e-05,
1368
+ "loss": 0.219,
1369
+ "step": 12800
1370
+ },
1371
+ {
1372
+ "epoch": 37.77,
1373
+ "learning_rate": 3e-05,
1374
+ "loss": 0.2277,
1375
+ "step": 12900
1376
+ },
1377
+ {
1378
+ "epoch": 38.0,
1379
+ "eval_accuracy": 0.4327179487179487,
1380
+ "eval_loss": 4.655264377593994,
1381
+ "eval_runtime": 4.1048,
1382
+ "eval_samples_per_second": 121.809,
1383
+ "eval_steps_per_second": 15.348,
1384
+ "step": 12977
1385
+ },
1386
+ {
1387
+ "epoch": 38.0,
1388
+ "eval_bleu": 0.08910920223304278,
1389
+ "eval_exact_match": 0.0,
1390
+ "eval_prefix_exact_match": 0.016,
1391
+ "step": 12977
1392
+ },
1393
+ {
1394
+ "epoch": 38.07,
1395
+ "learning_rate": 3e-05,
1396
+ "loss": 0.2243,
1397
+ "step": 13000
1398
+ },
1399
+ {
1400
+ "epoch": 38.36,
1401
+ "learning_rate": 3e-05,
1402
+ "loss": 0.2037,
1403
+ "step": 13100
1404
+ },
1405
+ {
1406
+ "epoch": 38.65,
1407
+ "learning_rate": 3e-05,
1408
+ "loss": 0.2127,
1409
+ "step": 13200
1410
+ },
1411
+ {
1412
+ "epoch": 38.95,
1413
+ "learning_rate": 3e-05,
1414
+ "loss": 0.2172,
1415
+ "step": 13300
1416
+ },
1417
+ {
1418
+ "epoch": 39.0,
1419
+ "eval_accuracy": 0.4327692307692308,
1420
+ "eval_loss": 4.707071304321289,
1421
+ "eval_runtime": 4.4091,
1422
+ "eval_samples_per_second": 113.403,
1423
+ "eval_steps_per_second": 14.289,
1424
+ "step": 13318
1425
+ },
1426
+ {
1427
+ "epoch": 39.0,
1428
+ "eval_bleu": 0.08849488892310922,
1429
+ "eval_exact_match": 0.0,
1430
+ "eval_prefix_exact_match": 0.012,
1431
+ "step": 13318
1432
+ },
1433
+ {
1434
+ "epoch": 39.24,
1435
+ "learning_rate": 3e-05,
1436
+ "loss": 0.1976,
1437
+ "step": 13400
1438
+ },
1439
+ {
1440
+ "epoch": 39.53,
1441
+ "learning_rate": 3e-05,
1442
+ "loss": 0.1983,
1443
+ "step": 13500
1444
+ },
1445
+ {
1446
+ "epoch": 39.82,
1447
+ "learning_rate": 3e-05,
1448
+ "loss": 0.2016,
1449
+ "step": 13600
1450
+ },
1451
+ {
1452
+ "epoch": 40.0,
1453
+ "eval_accuracy": 0.4330769230769231,
1454
+ "eval_loss": 4.717998504638672,
1455
+ "eval_runtime": 4.4131,
1456
+ "eval_samples_per_second": 113.298,
1457
+ "eval_steps_per_second": 14.276,
1458
+ "step": 13660
1459
+ },
1460
+ {
1461
+ "epoch": 40.0,
1462
+ "eval_bleu": 0.08922727862327968,
1463
+ "eval_exact_match": 0.0,
1464
+ "eval_prefix_exact_match": 0.018,
1465
+ "step": 13660
1466
+ },
1467
+ {
1468
+ "epoch": 40.12,
1469
+ "learning_rate": 3e-05,
1470
+ "loss": 0.1995,
1471
+ "step": 13700
1472
+ },
1473
+ {
1474
+ "epoch": 40.41,
1475
+ "learning_rate": 3e-05,
1476
+ "loss": 0.1852,
1477
+ "step": 13800
1478
+ },
1479
+ {
1480
+ "epoch": 40.7,
1481
+ "learning_rate": 3e-05,
1482
+ "loss": 0.1915,
1483
+ "step": 13900
1484
+ },
1485
+ {
1486
+ "epoch": 41.0,
1487
+ "learning_rate": 3e-05,
1488
+ "loss": 0.1965,
1489
+ "step": 14000
1490
+ },
1491
+ {
1492
+ "epoch": 41.0,
1493
+ "eval_accuracy": 0.43225641025641026,
1494
+ "eval_loss": 4.756761074066162,
1495
+ "eval_runtime": 4.223,
1496
+ "eval_samples_per_second": 118.4,
1497
+ "eval_steps_per_second": 14.918,
1498
+ "step": 14001
1499
+ },
1500
+ {
1501
+ "epoch": 41.0,
1502
+ "eval_bleu": 0.08298853800328854,
1503
+ "eval_exact_match": 0.0,
1504
+ "eval_prefix_exact_match": 0.01,
1505
+ "step": 14001
1506
+ },
1507
+ {
1508
+ "epoch": 41.29,
1509
+ "learning_rate": 3e-05,
1510
+ "loss": 0.1715,
1511
+ "step": 14100
1512
+ },
1513
+ {
1514
+ "epoch": 41.58,
1515
+ "learning_rate": 3e-05,
1516
+ "loss": 0.1816,
1517
+ "step": 14200
1518
+ },
1519
+ {
1520
+ "epoch": 41.87,
1521
+ "learning_rate": 3e-05,
1522
+ "loss": 0.1851,
1523
+ "step": 14300
1524
+ },
1525
+ {
1526
+ "epoch": 42.0,
1527
+ "eval_accuracy": 0.4321025641025641,
1528
+ "eval_loss": 4.756207466125488,
1529
+ "eval_runtime": 4.826,
1530
+ "eval_samples_per_second": 103.604,
1531
+ "eval_steps_per_second": 13.054,
1532
+ "step": 14343
1533
+ },
1534
+ {
1535
+ "epoch": 42.0,
1536
+ "eval_bleu": 0.075425255663042,
1537
+ "eval_exact_match": 0.0,
1538
+ "eval_prefix_exact_match": 0.004,
1539
+ "step": 14343
1540
+ },
1541
+ {
1542
+ "epoch": 42.17,
1543
+ "learning_rate": 3e-05,
1544
+ "loss": 0.1704,
1545
+ "step": 14400
1546
+ },
1547
+ {
1548
+ "epoch": 42.46,
1549
+ "learning_rate": 3e-05,
1550
+ "loss": 0.1681,
1551
+ "step": 14500
1552
+ },
1553
+ {
1554
+ "epoch": 42.75,
1555
+ "learning_rate": 3e-05,
1556
+ "loss": 0.1739,
1557
+ "step": 14600
1558
+ },
1559
+ {
1560
+ "epoch": 43.0,
1561
+ "eval_accuracy": 0.4316923076923077,
1562
+ "eval_loss": 4.787448406219482,
1563
+ "eval_runtime": 4.1236,
1564
+ "eval_samples_per_second": 121.253,
1565
+ "eval_steps_per_second": 15.278,
1566
+ "step": 14684
1567
+ },
1568
+ {
1569
+ "epoch": 43.0,
1570
+ "eval_bleu": 0.08854177442021546,
1571
+ "eval_exact_match": 0.0,
1572
+ "eval_prefix_exact_match": 0.008,
1573
+ "step": 14684
1574
+ },
1575
+ {
1576
+ "epoch": 43.11,
1577
+ "learning_rate": 3e-05,
1578
+ "loss": 0.1439,
1579
+ "step": 14700
1580
+ },
1581
+ {
1582
+ "epoch": 43.4,
1583
+ "learning_rate": 3e-05,
1584
+ "loss": 0.1552,
1585
+ "step": 14800
1586
+ },
1587
+ {
1588
+ "epoch": 43.69,
1589
+ "learning_rate": 3e-05,
1590
+ "loss": 0.165,
1591
+ "step": 14900
1592
+ },
1593
+ {
1594
+ "epoch": 43.99,
1595
+ "learning_rate": 3e-05,
1596
+ "loss": 0.1719,
1597
+ "step": 15000
1598
+ },
1599
+ {
1600
+ "epoch": 44.0,
1601
+ "eval_accuracy": 0.4323076923076923,
1602
+ "eval_loss": 4.802907943725586,
1603
+ "eval_runtime": 4.2242,
1604
+ "eval_samples_per_second": 118.365,
1605
+ "eval_steps_per_second": 14.914,
1606
+ "step": 15004
1607
+ },
1608
+ {
1609
+ "epoch": 44.0,
1610
+ "eval_bleu": 0.09013518216273568,
1611
+ "eval_exact_match": 0.0,
1612
+ "eval_prefix_exact_match": 0.006,
1613
+ "step": 15004
1614
+ },
1615
+ {
1616
+ "epoch": 44.28,
1617
+ "learning_rate": 3e-05,
1618
+ "loss": 0.1519,
1619
+ "step": 15100
1620
+ },
1621
+ {
1622
+ "epoch": 44.57,
1623
+ "learning_rate": 3e-05,
1624
+ "loss": 0.1562,
1625
+ "step": 15200
1626
+ },
1627
+ {
1628
+ "epoch": 44.87,
1629
+ "learning_rate": 3e-05,
1630
+ "loss": 0.1626,
1631
+ "step": 15300
1632
+ },
1633
+ {
1634
+ "epoch": 45.0,
1635
+ "eval_accuracy": 0.4318461538461538,
1636
+ "eval_loss": 4.781974792480469,
1637
+ "eval_runtime": 4.4138,
1638
+ "eval_samples_per_second": 113.282,
1639
+ "eval_steps_per_second": 14.274,
1640
+ "step": 15346
1641
+ },
1642
+ {
1643
+ "epoch": 45.0,
1644
+ "eval_bleu": 0.09066882849610237,
1645
+ "eval_exact_match": 0.0,
1646
+ "eval_prefix_exact_match": 0.01,
1647
+ "step": 15346
1648
+ }
1649
+ ],
1650
+ "logging_steps": 100,
1651
+ "max_steps": 17050,
1652
+ "num_train_epochs": 50,
1653
+ "save_steps": 500,
1654
+ "total_flos": 3.0390765535237243e+18,
1655
+ "trial_name": null,
1656
+ "trial_params": null
1657
+ }
checkpoint-15346/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea9d6322848cf3f6d565594d0b5abc1e5286d52a1122fb69bdfd52b385976e73
3
+ size 4728