codymd commited on
Commit
235334a
1 Parent(s): c573b77

Updated model with double training data

Browse files
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "opus-mt-fi-fi-finetuned-fi-to-rg/checkpoint-9000",
3
  "_num_labels": 3,
4
  "activation_dropout": 0.0,
5
  "activation_function": "swish",
 
1
  {
2
+ "_name_or_path": "Helsinki-NLP/opus-mt-fi-fi",
3
  "_num_labels": 3,
4
  "activation_dropout": 0.0,
5
  "activation_function": "swish",
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:70407919dbf934382deb1d0c6c1421f28f67d71b87290138b35d2cbcc60d6c42
3
  size 244313404
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e0d9ed67778042a67c313307e662ba8f9add0e624287c37815626401f5b71c0
3
  size 244313404
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:babcedfaa308860d141e6ec8d931022f8dbc63340ef0b36949d40e0223f3ea6e
3
  size 488516218
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0d7fd6466016e195b8d90ee81383f9734e0b069062adfe26bbc990503cc7495
3
  size 488516218
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b651ed337497ccc790f6c9d45dd8062f79869dd55c27d7638335585f82290f7d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64bdd01ab4c43618c8944f65eaaeb1c4d64a0b88051183e7a9084053fff4c682
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:98f172f931dd0e5c99920641ee2eafa5e22c4ab913eb4a09480a80cecf7634db
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:990e576c97d2c145e18ad15bc9d4c57052f1e2855c1656b07e47d358416b5401
3
  size 1064
special_tokens_map.json CHANGED
@@ -1,23 +1,5 @@
1
  {
2
- "eos_token": {
3
- "content": "</s>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "pad_token": {
10
- "content": "<pad>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "unk_token": {
17
- "content": "<unk>",
18
- "lstrip": false,
19
- "normalized": false,
20
- "rstrip": false,
21
- "single_word": false
22
- }
23
  }
 
1
  {
2
+ "eos_token": "</s>",
3
+ "pad_token": "<pad>",
4
+ "unk_token": "<unk>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  }
trainer_state.json CHANGED
@@ -1,217 +1,773 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 9.911894273127754,
5
  "eval_steps": 500,
6
- "global_step": 9000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.55,
13
- "learning_rate": 4.7246696035242295e-05,
14
- "loss": 2.9573,
15
  "step": 500
16
  },
17
  {
18
- "epoch": 1.0,
19
- "eval_bleu": 17.2961,
20
- "eval_gen_len": 53.6889,
21
- "eval_loss": 2.6540744304656982,
22
- "eval_runtime": 49.9025,
23
- "eval_samples_per_second": 4.509,
24
- "eval_steps_per_second": 0.581,
25
- "step": 908
26
- },
27
- {
28
- "epoch": 1.1,
29
- "learning_rate": 4.449339207048459e-05,
30
- "loss": 2.927,
31
  "step": 1000
32
  },
33
  {
34
- "epoch": 1.65,
35
- "learning_rate": 4.1740088105726874e-05,
36
- "loss": 2.7809,
37
  "step": 1500
38
  },
39
  {
40
- "epoch": 2.0,
41
- "eval_bleu": 19.3527,
42
- "eval_gen_len": 42.6667,
43
- "eval_loss": 2.5852272510528564,
44
- "eval_runtime": 37.69,
45
- "eval_samples_per_second": 5.97,
46
- "eval_steps_per_second": 0.769,
47
- "step": 1816
48
- },
49
- {
50
- "epoch": 2.2,
51
- "learning_rate": 3.898678414096916e-05,
52
- "loss": 2.7165,
53
  "step": 2000
54
  },
55
  {
56
- "epoch": 2.75,
57
- "learning_rate": 3.623348017621145e-05,
58
- "loss": 2.6527,
59
  "step": 2500
60
  },
61
  {
62
- "epoch": 3.0,
63
- "eval_bleu": 18.53,
64
- "eval_gen_len": 44.8044,
65
- "eval_loss": 2.521505355834961,
66
- "eval_runtime": 44.997,
67
- "eval_samples_per_second": 5.0,
68
- "eval_steps_per_second": 0.644,
69
- "step": 2724
70
- },
71
- {
72
- "epoch": 3.3,
73
- "learning_rate": 3.3480176211453745e-05,
74
- "loss": 2.5517,
75
  "step": 3000
76
  },
77
  {
78
- "epoch": 3.85,
79
- "learning_rate": 3.072687224669604e-05,
80
- "loss": 2.5069,
 
 
 
 
 
 
 
 
 
 
81
  "step": 3500
82
  },
83
  {
84
- "epoch": 4.0,
85
- "eval_bleu": 21.0151,
86
- "eval_gen_len": 39.6889,
87
- "eval_loss": 2.480680227279663,
88
- "eval_runtime": 35.165,
89
- "eval_samples_per_second": 6.398,
90
- "eval_steps_per_second": 0.825,
91
- "step": 3632
92
- },
93
- {
94
- "epoch": 4.41,
95
- "learning_rate": 2.7973568281938327e-05,
96
- "loss": 2.4257,
97
  "step": 4000
98
  },
99
  {
100
- "epoch": 4.96,
101
- "learning_rate": 2.522026431718062e-05,
102
- "loss": 2.3976,
103
  "step": 4500
104
  },
105
  {
106
- "epoch": 5.0,
107
- "eval_bleu": 21.4466,
108
- "eval_gen_len": 39.8667,
109
- "eval_loss": 2.4499943256378174,
110
- "eval_runtime": 31.2343,
111
- "eval_samples_per_second": 7.204,
112
- "eval_steps_per_second": 0.928,
113
- "step": 4540
114
- },
115
- {
116
- "epoch": 5.51,
117
- "learning_rate": 2.246696035242291e-05,
118
- "loss": 2.3175,
119
  "step": 5000
120
  },
121
  {
122
- "epoch": 6.0,
123
- "eval_bleu": 20.5303,
124
- "eval_gen_len": 42.0044,
125
- "eval_loss": 2.4212148189544678,
126
- "eval_runtime": 35.5561,
127
- "eval_samples_per_second": 6.328,
128
- "eval_steps_per_second": 0.816,
129
- "step": 5448
130
- },
131
- {
132
- "epoch": 6.06,
133
- "learning_rate": 1.97136563876652e-05,
134
- "loss": 2.3049,
135
  "step": 5500
136
  },
137
  {
138
- "epoch": 6.61,
139
- "learning_rate": 1.696035242290749e-05,
140
- "loss": 2.2268,
141
  "step": 6000
142
  },
143
  {
144
- "epoch": 7.0,
145
- "eval_bleu": 20.9986,
146
- "eval_gen_len": 42.8578,
147
- "eval_loss": 2.4010252952575684,
148
- "eval_runtime": 35.0954,
149
- "eval_samples_per_second": 6.411,
150
- "eval_steps_per_second": 0.826,
151
- "step": 6356
152
- },
153
- {
154
- "epoch": 7.16,
155
- "learning_rate": 1.420704845814978e-05,
156
- "loss": 2.2282,
157
  "step": 6500
158
  },
159
  {
160
- "epoch": 7.71,
161
- "learning_rate": 1.1453744493392071e-05,
162
- "loss": 2.1703,
 
 
 
 
 
 
 
 
 
 
163
  "step": 7000
164
  },
165
  {
166
- "epoch": 8.0,
167
- "eval_bleu": 22.8201,
168
- "eval_gen_len": 36.8267,
169
- "eval_loss": 2.3867766857147217,
170
- "eval_runtime": 23.8243,
171
- "eval_samples_per_second": 9.444,
172
- "eval_steps_per_second": 1.217,
173
- "step": 7264
174
- },
175
- {
176
- "epoch": 8.26,
177
- "learning_rate": 8.700440528634362e-06,
178
- "loss": 2.1589,
179
  "step": 7500
180
  },
181
  {
182
- "epoch": 8.81,
183
- "learning_rate": 5.947136563876652e-06,
184
- "loss": 2.1223,
185
  "step": 8000
186
  },
187
  {
188
- "epoch": 9.0,
189
- "eval_bleu": 22.9591,
190
- "eval_gen_len": 37.3422,
191
- "eval_loss": 2.3790900707244873,
192
- "eval_runtime": 28.3931,
193
- "eval_samples_per_second": 7.924,
194
- "eval_steps_per_second": 1.021,
195
- "step": 8172
196
- },
197
- {
198
- "epoch": 9.36,
199
- "learning_rate": 3.193832599118943e-06,
200
- "loss": 2.0995,
201
  "step": 8500
202
  },
203
  {
204
- "epoch": 9.91,
205
- "learning_rate": 4.4052863436123357e-07,
206
- "loss": 2.1029,
207
  "step": 9000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  }
209
  ],
210
  "logging_steps": 500,
211
- "max_steps": 9080,
212
- "num_train_epochs": 10,
213
  "save_steps": 500,
214
- "total_flos": 2122263810146304.0,
215
  "trial_name": null,
216
  "trial_params": null
217
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 25.31645569620253,
5
  "eval_steps": 500,
6
+ "global_step": 42000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.3,
13
+ "learning_rate": 4.969861362266426e-05,
14
+ "loss": 3.7006,
15
  "step": 500
16
  },
17
  {
18
+ "epoch": 0.6,
19
+ "learning_rate": 4.939722724532851e-05,
20
+ "loss": 3.5872,
 
 
 
 
 
 
 
 
 
 
21
  "step": 1000
22
  },
23
  {
24
+ "epoch": 0.9,
25
+ "learning_rate": 4.909584086799277e-05,
26
+ "loss": 3.4617,
27
  "step": 1500
28
  },
29
  {
30
+ "epoch": 1.0,
31
+ "eval_bleu": 12.3227,
32
+ "eval_gen_len": 54.6075,
33
+ "eval_loss": 3.1113245487213135,
34
+ "eval_runtime": 122.2496,
35
+ "eval_samples_per_second": 3.272,
36
+ "eval_steps_per_second": 0.409,
37
+ "step": 1659
38
+ },
39
+ {
40
+ "epoch": 1.21,
41
+ "learning_rate": 4.8794454490657024e-05,
42
+ "loss": 3.298,
43
  "step": 2000
44
  },
45
  {
46
+ "epoch": 1.51,
47
+ "learning_rate": 4.849306811332128e-05,
48
+ "loss": 3.2018,
49
  "step": 2500
50
  },
51
  {
52
+ "epoch": 1.81,
53
+ "learning_rate": 4.8191681735985535e-05,
54
+ "loss": 3.1014,
 
 
 
 
 
 
 
 
 
 
55
  "step": 3000
56
  },
57
  {
58
+ "epoch": 2.0,
59
+ "eval_bleu": 15.8487,
60
+ "eval_gen_len": 50.1125,
61
+ "eval_loss": 2.8111488819122314,
62
+ "eval_runtime": 92.4044,
63
+ "eval_samples_per_second": 4.329,
64
+ "eval_steps_per_second": 0.541,
65
+ "step": 3318
66
+ },
67
+ {
68
+ "epoch": 2.11,
69
+ "learning_rate": 4.789029535864979e-05,
70
+ "loss": 2.9998,
71
  "step": 3500
72
  },
73
  {
74
+ "epoch": 2.41,
75
+ "learning_rate": 4.7588908981314046e-05,
76
+ "loss": 2.883,
 
 
 
 
 
 
 
 
 
 
77
  "step": 4000
78
  },
79
  {
80
+ "epoch": 2.71,
81
+ "learning_rate": 4.7287522603978304e-05,
82
+ "loss": 2.8409,
83
  "step": 4500
84
  },
85
  {
86
+ "epoch": 3.0,
87
+ "eval_bleu": 20.5509,
88
+ "eval_gen_len": 43.98,
89
+ "eval_loss": 2.617112398147583,
90
+ "eval_runtime": 70.0674,
91
+ "eval_samples_per_second": 5.709,
92
+ "eval_steps_per_second": 0.714,
93
+ "step": 4977
94
+ },
95
+ {
96
+ "epoch": 3.01,
97
+ "learning_rate": 4.6986136226642556e-05,
98
+ "loss": 2.8043,
99
  "step": 5000
100
  },
101
  {
102
+ "epoch": 3.32,
103
+ "learning_rate": 4.6684749849306815e-05,
104
+ "loss": 2.6486,
 
 
 
 
 
 
 
 
 
 
105
  "step": 5500
106
  },
107
  {
108
+ "epoch": 3.62,
109
+ "learning_rate": 4.638336347197107e-05,
110
+ "loss": 2.6127,
111
  "step": 6000
112
  },
113
  {
114
+ "epoch": 3.92,
115
+ "learning_rate": 4.6081977094635326e-05,
116
+ "loss": 2.5718,
 
 
 
 
 
 
 
 
 
 
117
  "step": 6500
118
  },
119
  {
120
+ "epoch": 4.0,
121
+ "eval_bleu": 21.5273,
122
+ "eval_gen_len": 40.8575,
123
+ "eval_loss": 2.4335193634033203,
124
+ "eval_runtime": 62.0368,
125
+ "eval_samples_per_second": 6.448,
126
+ "eval_steps_per_second": 0.806,
127
+ "step": 6636
128
+ },
129
+ {
130
+ "epoch": 4.22,
131
+ "learning_rate": 4.5780590717299585e-05,
132
+ "loss": 2.4535,
133
  "step": 7000
134
  },
135
  {
136
+ "epoch": 4.52,
137
+ "learning_rate": 4.547920433996384e-05,
138
+ "loss": 2.4269,
 
 
 
 
 
 
 
 
 
 
139
  "step": 7500
140
  },
141
  {
142
+ "epoch": 4.82,
143
+ "learning_rate": 4.5177817962628096e-05,
144
+ "loss": 2.3852,
145
  "step": 8000
146
  },
147
  {
148
+ "epoch": 5.0,
149
+ "eval_bleu": 24.0185,
150
+ "eval_gen_len": 38.945,
151
+ "eval_loss": 2.2908990383148193,
152
+ "eval_runtime": 53.5509,
153
+ "eval_samples_per_second": 7.47,
154
+ "eval_steps_per_second": 0.934,
155
+ "step": 8295
156
+ },
157
+ {
158
+ "epoch": 5.12,
159
+ "learning_rate": 4.487643158529235e-05,
160
+ "loss": 2.3305,
161
  "step": 8500
162
  },
163
  {
164
+ "epoch": 5.42,
165
+ "learning_rate": 4.45750452079566e-05,
166
+ "loss": 2.2361,
167
  "step": 9000
168
+ },
169
+ {
170
+ "epoch": 5.73,
171
+ "learning_rate": 4.427365883062086e-05,
172
+ "loss": 2.2201,
173
+ "step": 9500
174
+ },
175
+ {
176
+ "epoch": 6.0,
177
+ "eval_bleu": 25.0722,
178
+ "eval_gen_len": 38.4525,
179
+ "eval_loss": 2.2150681018829346,
180
+ "eval_runtime": 47.2306,
181
+ "eval_samples_per_second": 8.469,
182
+ "eval_steps_per_second": 1.059,
183
+ "step": 9954
184
+ },
185
+ {
186
+ "epoch": 6.03,
187
+ "learning_rate": 4.397227245328511e-05,
188
+ "loss": 2.1955,
189
+ "step": 10000
190
+ },
191
+ {
192
+ "epoch": 6.33,
193
+ "learning_rate": 4.367088607594937e-05,
194
+ "loss": 2.0928,
195
+ "step": 10500
196
+ },
197
+ {
198
+ "epoch": 6.63,
199
+ "learning_rate": 4.336949969861363e-05,
200
+ "loss": 2.0947,
201
+ "step": 11000
202
+ },
203
+ {
204
+ "epoch": 6.93,
205
+ "learning_rate": 4.306811332127788e-05,
206
+ "loss": 2.0583,
207
+ "step": 11500
208
+ },
209
+ {
210
+ "epoch": 7.0,
211
+ "eval_bleu": 26.051,
212
+ "eval_gen_len": 40.0775,
213
+ "eval_loss": 2.1219234466552734,
214
+ "eval_runtime": 55.6386,
215
+ "eval_samples_per_second": 7.189,
216
+ "eval_steps_per_second": 0.899,
217
+ "step": 11613
218
+ },
219
+ {
220
+ "epoch": 7.23,
221
+ "learning_rate": 4.276672694394214e-05,
222
+ "loss": 1.9657,
223
+ "step": 12000
224
+ },
225
+ {
226
+ "epoch": 7.53,
227
+ "learning_rate": 4.246534056660639e-05,
228
+ "loss": 1.9594,
229
+ "step": 12500
230
+ },
231
+ {
232
+ "epoch": 7.84,
233
+ "learning_rate": 4.216395418927065e-05,
234
+ "loss": 1.9464,
235
+ "step": 13000
236
+ },
237
+ {
238
+ "epoch": 8.0,
239
+ "eval_bleu": 27.8486,
240
+ "eval_gen_len": 39.54,
241
+ "eval_loss": 2.0415802001953125,
242
+ "eval_runtime": 50.0785,
243
+ "eval_samples_per_second": 7.987,
244
+ "eval_steps_per_second": 0.998,
245
+ "step": 13272
246
+ },
247
+ {
248
+ "epoch": 8.14,
249
+ "learning_rate": 4.186256781193491e-05,
250
+ "loss": 1.8901,
251
+ "step": 13500
252
+ },
253
+ {
254
+ "epoch": 8.44,
255
+ "learning_rate": 4.1561181434599153e-05,
256
+ "loss": 1.8331,
257
+ "step": 14000
258
+ },
259
+ {
260
+ "epoch": 8.74,
261
+ "learning_rate": 4.125979505726341e-05,
262
+ "loss": 1.8273,
263
+ "step": 14500
264
+ },
265
+ {
266
+ "epoch": 9.0,
267
+ "eval_bleu": 28.6882,
268
+ "eval_gen_len": 38.97,
269
+ "eval_loss": 1.9714975357055664,
270
+ "eval_runtime": 47.8353,
271
+ "eval_samples_per_second": 8.362,
272
+ "eval_steps_per_second": 1.045,
273
+ "step": 14931
274
+ },
275
+ {
276
+ "epoch": 9.04,
277
+ "learning_rate": 4.095840867992767e-05,
278
+ "loss": 1.8071,
279
+ "step": 15000
280
+ },
281
+ {
282
+ "epoch": 9.34,
283
+ "learning_rate": 4.065702230259192e-05,
284
+ "loss": 1.724,
285
+ "step": 15500
286
+ },
287
+ {
288
+ "epoch": 9.64,
289
+ "learning_rate": 4.035563592525618e-05,
290
+ "loss": 1.7173,
291
+ "step": 16000
292
+ },
293
+ {
294
+ "epoch": 9.95,
295
+ "learning_rate": 4.0054249547920434e-05,
296
+ "loss": 1.7341,
297
+ "step": 16500
298
+ },
299
+ {
300
+ "epoch": 10.0,
301
+ "eval_bleu": 29.4158,
302
+ "eval_gen_len": 39.27,
303
+ "eval_loss": 1.922670602798462,
304
+ "eval_runtime": 48.3901,
305
+ "eval_samples_per_second": 8.266,
306
+ "eval_steps_per_second": 1.033,
307
+ "step": 16590
308
+ },
309
+ {
310
+ "epoch": 10.25,
311
+ "learning_rate": 3.975286317058469e-05,
312
+ "loss": 1.6432,
313
+ "step": 17000
314
+ },
315
+ {
316
+ "epoch": 10.55,
317
+ "learning_rate": 3.945147679324895e-05,
318
+ "loss": 1.6414,
319
+ "step": 17500
320
+ },
321
+ {
322
+ "epoch": 10.85,
323
+ "learning_rate": 3.9150090415913203e-05,
324
+ "loss": 1.6285,
325
+ "step": 18000
326
+ },
327
+ {
328
+ "epoch": 11.0,
329
+ "eval_bleu": 29.6336,
330
+ "eval_gen_len": 39.7025,
331
+ "eval_loss": 1.8723887205123901,
332
+ "eval_runtime": 49.1746,
333
+ "eval_samples_per_second": 8.134,
334
+ "eval_steps_per_second": 1.017,
335
+ "step": 18249
336
+ },
337
+ {
338
+ "epoch": 11.15,
339
+ "learning_rate": 3.884870403857746e-05,
340
+ "loss": 1.5753,
341
+ "step": 18500
342
+ },
343
+ {
344
+ "epoch": 11.45,
345
+ "learning_rate": 3.8547317661241714e-05,
346
+ "loss": 1.5525,
347
+ "step": 19000
348
+ },
349
+ {
350
+ "epoch": 11.75,
351
+ "learning_rate": 3.8245931283905966e-05,
352
+ "loss": 1.5466,
353
+ "step": 19500
354
+ },
355
+ {
356
+ "epoch": 12.0,
357
+ "eval_bleu": 31.3296,
358
+ "eval_gen_len": 39.8675,
359
+ "eval_loss": 1.816349744796753,
360
+ "eval_runtime": 49.6256,
361
+ "eval_samples_per_second": 8.06,
362
+ "eval_steps_per_second": 1.008,
363
+ "step": 19908
364
+ },
365
+ {
366
+ "epoch": 12.06,
367
+ "learning_rate": 3.7944544906570225e-05,
368
+ "loss": 1.5254,
369
+ "step": 20000
370
+ },
371
+ {
372
+ "epoch": 12.36,
373
+ "learning_rate": 3.764315852923448e-05,
374
+ "loss": 1.4676,
375
+ "step": 20500
376
+ },
377
+ {
378
+ "epoch": 12.66,
379
+ "learning_rate": 3.7341772151898736e-05,
380
+ "loss": 1.4678,
381
+ "step": 21000
382
+ },
383
+ {
384
+ "epoch": 12.96,
385
+ "learning_rate": 3.7040385774562995e-05,
386
+ "loss": 1.4607,
387
+ "step": 21500
388
+ },
389
+ {
390
+ "epoch": 13.0,
391
+ "eval_bleu": 31.7515,
392
+ "eval_gen_len": 38.405,
393
+ "eval_loss": 1.7929939031600952,
394
+ "eval_runtime": 44.5172,
395
+ "eval_samples_per_second": 8.985,
396
+ "eval_steps_per_second": 1.123,
397
+ "step": 21567
398
+ },
399
+ {
400
+ "epoch": 13.26,
401
+ "learning_rate": 3.6738999397227247e-05,
402
+ "loss": 1.3787,
403
+ "step": 22000
404
+ },
405
+ {
406
+ "epoch": 13.56,
407
+ "learning_rate": 3.6437613019891505e-05,
408
+ "loss": 1.4049,
409
+ "step": 22500
410
+ },
411
+ {
412
+ "epoch": 13.86,
413
+ "learning_rate": 3.613622664255576e-05,
414
+ "loss": 1.385,
415
+ "step": 23000
416
+ },
417
+ {
418
+ "epoch": 14.0,
419
+ "eval_bleu": 32.458,
420
+ "eval_gen_len": 39.4675,
421
+ "eval_loss": 1.7518789768218994,
422
+ "eval_runtime": 49.1331,
423
+ "eval_samples_per_second": 8.141,
424
+ "eval_steps_per_second": 1.018,
425
+ "step": 23226
426
+ },
427
+ {
428
+ "epoch": 14.17,
429
+ "learning_rate": 3.5834840265220016e-05,
430
+ "loss": 1.3403,
431
+ "step": 23500
432
+ },
433
+ {
434
+ "epoch": 14.47,
435
+ "learning_rate": 3.553345388788427e-05,
436
+ "loss": 1.3166,
437
+ "step": 24000
438
+ },
439
+ {
440
+ "epoch": 14.77,
441
+ "learning_rate": 3.523206751054853e-05,
442
+ "loss": 1.321,
443
+ "step": 24500
444
+ },
445
+ {
446
+ "epoch": 15.0,
447
+ "eval_bleu": 32.9411,
448
+ "eval_gen_len": 38.8025,
449
+ "eval_loss": 1.7194263935089111,
450
+ "eval_runtime": 45.6686,
451
+ "eval_samples_per_second": 8.759,
452
+ "eval_steps_per_second": 1.095,
453
+ "step": 24885
454
+ },
455
+ {
456
+ "epoch": 15.07,
457
+ "learning_rate": 3.493068113321278e-05,
458
+ "loss": 1.2976,
459
+ "step": 25000
460
+ },
461
+ {
462
+ "epoch": 15.37,
463
+ "learning_rate": 3.462929475587703e-05,
464
+ "loss": 1.2358,
465
+ "step": 25500
466
+ },
467
+ {
468
+ "epoch": 15.67,
469
+ "learning_rate": 3.432790837854129e-05,
470
+ "loss": 1.2592,
471
+ "step": 26000
472
+ },
473
+ {
474
+ "epoch": 15.97,
475
+ "learning_rate": 3.402652200120555e-05,
476
+ "loss": 1.2662,
477
+ "step": 26500
478
+ },
479
+ {
480
+ "epoch": 16.0,
481
+ "eval_bleu": 33.8478,
482
+ "eval_gen_len": 39.1275,
483
+ "eval_loss": 1.6950603723526,
484
+ "eval_runtime": 49.9911,
485
+ "eval_samples_per_second": 8.001,
486
+ "eval_steps_per_second": 1.0,
487
+ "step": 26544
488
+ },
489
+ {
490
+ "epoch": 16.27,
491
+ "learning_rate": 3.37251356238698e-05,
492
+ "loss": 1.1963,
493
+ "step": 27000
494
+ },
495
+ {
496
+ "epoch": 16.58,
497
+ "learning_rate": 3.342374924653406e-05,
498
+ "loss": 1.2002,
499
+ "step": 27500
500
+ },
501
+ {
502
+ "epoch": 16.88,
503
+ "learning_rate": 3.312236286919831e-05,
504
+ "loss": 1.1939,
505
+ "step": 28000
506
+ },
507
+ {
508
+ "epoch": 17.0,
509
+ "eval_bleu": 34.5277,
510
+ "eval_gen_len": 39.0225,
511
+ "eval_loss": 1.685713529586792,
512
+ "eval_runtime": 49.4943,
513
+ "eval_samples_per_second": 8.082,
514
+ "eval_steps_per_second": 1.01,
515
+ "step": 28203
516
+ },
517
+ {
518
+ "epoch": 17.18,
519
+ "learning_rate": 3.282097649186257e-05,
520
+ "loss": 1.1459,
521
+ "step": 28500
522
+ },
523
+ {
524
+ "epoch": 17.48,
525
+ "learning_rate": 3.251959011452683e-05,
526
+ "loss": 1.1326,
527
+ "step": 29000
528
+ },
529
+ {
530
+ "epoch": 17.78,
531
+ "learning_rate": 3.221820373719108e-05,
532
+ "loss": 1.1406,
533
+ "step": 29500
534
+ },
535
+ {
536
+ "epoch": 18.0,
537
+ "eval_bleu": 35.8691,
538
+ "eval_gen_len": 38.76,
539
+ "eval_loss": 1.6470690965652466,
540
+ "eval_runtime": 45.2962,
541
+ "eval_samples_per_second": 8.831,
542
+ "eval_steps_per_second": 1.104,
543
+ "step": 29862
544
+ },
545
+ {
546
+ "epoch": 18.08,
547
+ "learning_rate": 3.191681735985534e-05,
548
+ "loss": 1.1292,
549
+ "step": 30000
550
+ },
551
+ {
552
+ "epoch": 18.38,
553
+ "learning_rate": 3.161543098251959e-05,
554
+ "loss": 1.071,
555
+ "step": 30500
556
+ },
557
+ {
558
+ "epoch": 18.69,
559
+ "learning_rate": 3.1314044605183844e-05,
560
+ "loss": 1.0918,
561
+ "step": 31000
562
+ },
563
+ {
564
+ "epoch": 18.99,
565
+ "learning_rate": 3.10126582278481e-05,
566
+ "loss": 1.0759,
567
+ "step": 31500
568
+ },
569
+ {
570
+ "epoch": 19.0,
571
+ "eval_bleu": 36.4448,
572
+ "eval_gen_len": 38.6925,
573
+ "eval_loss": 1.6456927061080933,
574
+ "eval_runtime": 46.4772,
575
+ "eval_samples_per_second": 8.606,
576
+ "eval_steps_per_second": 1.076,
577
+ "step": 31521
578
+ },
579
+ {
580
+ "epoch": 19.29,
581
+ "learning_rate": 3.0711271850512355e-05,
582
+ "loss": 1.0193,
583
+ "step": 32000
584
+ },
585
+ {
586
+ "epoch": 19.59,
587
+ "learning_rate": 3.0409885473176613e-05,
588
+ "loss": 1.0248,
589
+ "step": 32500
590
+ },
591
+ {
592
+ "epoch": 19.89,
593
+ "learning_rate": 3.010849909584087e-05,
594
+ "loss": 1.0378,
595
+ "step": 33000
596
+ },
597
+ {
598
+ "epoch": 20.0,
599
+ "eval_bleu": 37.2905,
600
+ "eval_gen_len": 38.945,
601
+ "eval_loss": 1.6285927295684814,
602
+ "eval_runtime": 49.741,
603
+ "eval_samples_per_second": 8.042,
604
+ "eval_steps_per_second": 1.005,
605
+ "step": 33180
606
+ },
607
+ {
608
+ "epoch": 20.19,
609
+ "learning_rate": 2.9807112718505124e-05,
610
+ "loss": 0.9915,
611
+ "step": 33500
612
+ },
613
+ {
614
+ "epoch": 20.49,
615
+ "learning_rate": 2.9505726341169383e-05,
616
+ "loss": 0.9848,
617
+ "step": 34000
618
+ },
619
+ {
620
+ "epoch": 20.8,
621
+ "learning_rate": 2.9204339963833638e-05,
622
+ "loss": 0.9851,
623
+ "step": 34500
624
+ },
625
+ {
626
+ "epoch": 21.0,
627
+ "eval_bleu": 38.4264,
628
+ "eval_gen_len": 38.7175,
629
+ "eval_loss": 1.5997543334960938,
630
+ "eval_runtime": 44.5032,
631
+ "eval_samples_per_second": 8.988,
632
+ "eval_steps_per_second": 1.124,
633
+ "step": 34839
634
+ },
635
+ {
636
+ "epoch": 21.1,
637
+ "learning_rate": 2.8902953586497894e-05,
638
+ "loss": 0.97,
639
+ "step": 35000
640
+ },
641
+ {
642
+ "epoch": 21.4,
643
+ "learning_rate": 2.8601567209162146e-05,
644
+ "loss": 0.9436,
645
+ "step": 35500
646
+ },
647
+ {
648
+ "epoch": 21.7,
649
+ "learning_rate": 2.83001808318264e-05,
650
+ "loss": 0.9372,
651
+ "step": 36000
652
+ },
653
+ {
654
+ "epoch": 22.0,
655
+ "eval_bleu": 37.9614,
656
+ "eval_gen_len": 38.9425,
657
+ "eval_loss": 1.607030987739563,
658
+ "eval_runtime": 47.0014,
659
+ "eval_samples_per_second": 8.51,
660
+ "eval_steps_per_second": 1.064,
661
+ "step": 36498
662
+ },
663
+ {
664
+ "epoch": 22.0,
665
+ "learning_rate": 2.7998794454490656e-05,
666
+ "loss": 0.9437,
667
+ "step": 36500
668
+ },
669
+ {
670
+ "epoch": 22.3,
671
+ "learning_rate": 2.7697408077154912e-05,
672
+ "loss": 0.8917,
673
+ "step": 37000
674
+ },
675
+ {
676
+ "epoch": 22.6,
677
+ "learning_rate": 2.7396021699819167e-05,
678
+ "loss": 0.8692,
679
+ "step": 37500
680
+ },
681
+ {
682
+ "epoch": 22.91,
683
+ "learning_rate": 2.7094635322483426e-05,
684
+ "loss": 0.9191,
685
+ "step": 38000
686
+ },
687
+ {
688
+ "epoch": 23.0,
689
+ "eval_bleu": 38.8655,
690
+ "eval_gen_len": 38.8825,
691
+ "eval_loss": 1.5746939182281494,
692
+ "eval_runtime": 50.4993,
693
+ "eval_samples_per_second": 7.921,
694
+ "eval_steps_per_second": 0.99,
695
+ "step": 38157
696
+ },
697
+ {
698
+ "epoch": 23.21,
699
+ "learning_rate": 2.679324894514768e-05,
700
+ "loss": 0.8555,
701
+ "step": 38500
702
+ },
703
+ {
704
+ "epoch": 23.51,
705
+ "learning_rate": 2.6491862567811937e-05,
706
+ "loss": 0.8533,
707
+ "step": 39000
708
+ },
709
+ {
710
+ "epoch": 23.81,
711
+ "learning_rate": 2.6190476190476192e-05,
712
+ "loss": 0.8673,
713
+ "step": 39500
714
+ },
715
+ {
716
+ "epoch": 24.0,
717
+ "eval_bleu": 39.4605,
718
+ "eval_gen_len": 39.0175,
719
+ "eval_loss": 1.5650146007537842,
720
+ "eval_runtime": 50.8092,
721
+ "eval_samples_per_second": 7.873,
722
+ "eval_steps_per_second": 0.984,
723
+ "step": 39816
724
+ },
725
+ {
726
+ "epoch": 24.11,
727
+ "learning_rate": 2.5889089813140448e-05,
728
+ "loss": 0.841,
729
+ "step": 40000
730
+ },
731
+ {
732
+ "epoch": 24.41,
733
+ "learning_rate": 2.5587703435804706e-05,
734
+ "loss": 0.8155,
735
+ "step": 40500
736
+ },
737
+ {
738
+ "epoch": 24.71,
739
+ "learning_rate": 2.5286317058468955e-05,
740
+ "loss": 0.811,
741
+ "step": 41000
742
+ },
743
+ {
744
+ "epoch": 25.0,
745
+ "eval_bleu": 39.6804,
746
+ "eval_gen_len": 38.77,
747
+ "eval_loss": 1.5603779554367065,
748
+ "eval_runtime": 45.7389,
749
+ "eval_samples_per_second": 8.745,
750
+ "eval_steps_per_second": 1.093,
751
+ "step": 41475
752
+ },
753
+ {
754
+ "epoch": 25.02,
755
+ "learning_rate": 2.4984930681133214e-05,
756
+ "loss": 0.8335,
757
+ "step": 41500
758
+ },
759
+ {
760
+ "epoch": 25.32,
761
+ "learning_rate": 2.468354430379747e-05,
762
+ "loss": 0.7688,
763
+ "step": 42000
764
  }
765
  ],
766
  "logging_steps": 500,
767
+ "max_steps": 82950,
768
+ "num_train_epochs": 50,
769
  "save_steps": 500,
770
+ "total_flos": 1.0127259403812864e+16,
771
  "trial_name": null,
772
  "trial_params": null
773
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dac49814691ca1f48770eb6a35c6117165ddb6d2fb57f0f26e611d5cdd1837e5
3
  size 4728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc6327ecfefde560bb383925438f7fcfc8ca830dcd7bddc21f875fc51eed7e44
3
  size 4728