dq158 committed on
Commit
d5b2e3b
1 Parent(s): e9e85a3

Training in progress, epoch 1, checkpoint

Browse files
last-checkpoint/README.md CHANGED
@@ -18,6 +18,7 @@ base_model: google/flan-t5-xl
18
 
19
 
20
  - **Developed by:** [More Information Needed]
 
21
  - **Shared by [optional]:** [More Information Needed]
22
  - **Model type:** [More Information Needed]
23
  - **Language(s) (NLP):** [More Information Needed]
@@ -76,7 +77,7 @@ Use the code below to get started with the model.
76
 
77
  ### Training Data
78
 
79
- <!-- This should link to a Data Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
80
 
81
  [More Information Needed]
82
 
@@ -107,7 +108,7 @@ Use the code below to get started with the model.
107
 
108
  #### Testing Data
109
 
110
- <!-- This should link to a Data Card if possible. -->
111
 
112
  [More Information Needed]
113
 
@@ -204,4 +205,4 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
204
  ### Framework versions
205
 
206
 
207
- - PEFT 0.6.0
 
18
 
19
 
20
  - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
  - **Shared by [optional]:** [More Information Needed]
23
  - **Model type:** [More Information Needed]
24
  - **Language(s) (NLP):** [More Information Needed]
 
77
 
78
  ### Training Data
79
 
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
 
82
  [More Information Needed]
83
 
 
108
 
109
  #### Testing Data
110
 
111
+ <!-- This should link to a Dataset Card if possible. -->
112
 
113
  [More Information Needed]
114
 
 
205
  ### Framework versions
206
 
207
 
208
+ - PEFT 0.6.2
last-checkpoint/adapter_config.json CHANGED
@@ -16,8 +16,8 @@
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
19
- "q",
20
- "v"
21
  ],
22
  "task_type": "SEQ_2_SEQ_LM"
23
  }
 
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
19
+ "v",
20
+ "q"
21
  ],
22
  "task_type": "SEQ_2_SEQ_LM"
23
  }
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b127cfb645d1c75f132a876553609b5374befd9def67d558e5974fa97067d21c
3
  size 37789864
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4602760b59b06299e933de1cabee7a847e392fa5c40fee852f8918f5b35ad215
3
  size 37789864
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:853fdc84d3fd43924101f3b600fbd9f7fc43eb8341b64b2118c78dce07ce9bc5
3
- size 1256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3edb87d3ecd16a7429d7ac352d010d359f3a0dff217ec6098b81876c6ebff172
3
+ size 2621690
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3bfea11a2e02465bb531791987f71426cd35b56e7ebc758216b34e89a76ca829
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1da9aa18d7924a07f722f332077e6eae3d36694b90416e0a3981d3bfad8282e
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2492f9b5d907d6544a13d4d0a642eb64eb32b5925504787877c14c24ec71c83
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c264768290f9849fa6fa417060b5498e9d26e7c5d585205c9e94d1d4cf3a2be
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,248 +1,518 @@
1
  {
2
- "best_metric": 3.0692174434661865,
3
- "best_model_checkpoint": "dq158/pingusPongus/checkpoint-17782",
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 17782,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.03,
13
- "learning_rate": 6e-05,
14
- "loss": 3.2033,
15
  "step": 500
16
  },
17
  {
18
- "epoch": 0.06,
19
- "learning_rate": 5.9999869700884375e-05,
20
- "loss": 3.1925,
21
  "step": 1000
22
  },
23
  {
24
- "epoch": 0.08,
25
- "learning_rate": 5.999947880466937e-05,
26
- "loss": 3.1596,
27
  "step": 1500
28
  },
29
  {
30
- "epoch": 0.11,
31
- "learning_rate": 5.999882731475053e-05,
32
- "loss": 3.2032,
33
  "step": 2000
34
  },
35
  {
36
- "epoch": 0.14,
37
- "learning_rate": 5.99979152367871e-05,
38
- "loss": 3.1379,
39
  "step": 2500
40
  },
41
  {
42
- "epoch": 0.17,
43
- "learning_rate": 5.999674257870195e-05,
44
- "loss": 3.1821,
45
  "step": 3000
46
  },
47
  {
48
- "epoch": 0.2,
49
- "learning_rate": 5.9995309350681496e-05,
50
- "loss": 3.165,
51
  "step": 3500
52
  },
53
  {
54
- "epoch": 0.22,
55
- "learning_rate": 5.9993615565175614e-05,
56
- "loss": 3.1682,
57
  "step": 4000
58
  },
59
  {
60
- "epoch": 0.25,
61
- "learning_rate": 5.999166123689758e-05,
62
- "loss": 3.1234,
63
  "step": 4500
64
  },
65
  {
66
- "epoch": 0.28,
67
- "learning_rate": 5.9989446382823863e-05,
68
- "loss": 3.1389,
69
  "step": 5000
70
  },
71
  {
72
- "epoch": 0.31,
73
- "learning_rate": 5.9986971022194026e-05,
74
- "loss": 3.1197,
75
  "step": 5500
76
  },
77
  {
78
- "epoch": 0.34,
79
- "learning_rate": 5.998423517651056e-05,
80
- "loss": 3.2251,
81
  "step": 6000
82
  },
83
  {
84
- "epoch": 0.37,
85
- "learning_rate": 5.998123886953869e-05,
86
- "loss": 3.2011,
87
  "step": 6500
88
  },
89
  {
90
- "epoch": 0.39,
91
- "learning_rate": 5.9977982127306157e-05,
92
- "loss": 3.1573,
93
  "step": 7000
94
  },
95
  {
96
- "epoch": 0.42,
97
- "learning_rate": 5.9974464978103e-05,
98
- "loss": 3.1049,
99
  "step": 7500
100
  },
101
  {
102
- "epoch": 0.45,
103
- "learning_rate": 5.997068745248132e-05,
104
- "loss": 3.1919,
105
  "step": 8000
106
  },
107
  {
108
- "epoch": 0.48,
109
- "learning_rate": 5.996664958325499e-05,
110
- "loss": 3.1192,
111
  "step": 8500
112
  },
113
  {
114
- "epoch": 0.51,
115
- "learning_rate": 5.99623514054994e-05,
116
- "loss": 3.1393,
117
  "step": 9000
118
  },
119
  {
120
- "epoch": 0.53,
121
- "learning_rate": 5.995779295655114e-05,
122
- "loss": 3.1306,
123
  "step": 9500
124
  },
125
  {
126
- "epoch": 0.56,
127
- "learning_rate": 5.995297427600766e-05,
128
- "loss": 3.1719,
129
  "step": 10000
130
  },
131
  {
132
- "epoch": 0.59,
133
- "learning_rate": 5.994789540572695e-05,
134
- "loss": 3.1197,
135
  "step": 10500
136
  },
137
  {
138
- "epoch": 0.62,
139
- "learning_rate": 5.994255638982716e-05,
140
- "loss": 3.1231,
141
  "step": 11000
142
  },
143
  {
144
- "epoch": 0.65,
145
- "learning_rate": 5.9936957274686233e-05,
146
- "loss": 3.1585,
147
  "step": 11500
148
  },
149
  {
150
- "epoch": 0.67,
151
- "learning_rate": 5.9931098108941496e-05,
152
- "loss": 3.2459,
153
  "step": 12000
154
  },
155
  {
156
- "epoch": 0.7,
157
- "learning_rate": 5.9924978943489196e-05,
158
- "loss": 3.1119,
159
  "step": 12500
160
  },
161
  {
162
- "epoch": 0.73,
163
- "learning_rate": 5.991859983148415e-05,
164
- "loss": 3.1499,
165
  "step": 13000
166
  },
167
  {
168
- "epoch": 0.76,
169
- "learning_rate": 5.9911960828339176e-05,
170
- "loss": 3.2634,
171
  "step": 13500
172
  },
173
  {
174
- "epoch": 0.79,
175
- "learning_rate": 5.9905061991724704e-05,
176
- "loss": 3.1048,
177
  "step": 14000
178
  },
179
  {
180
- "epoch": 0.82,
181
- "learning_rate": 5.9897903381568234e-05,
182
- "loss": 3.1717,
183
  "step": 14500
184
  },
185
  {
186
- "epoch": 0.84,
187
- "learning_rate": 5.989048506005378e-05,
188
- "loss": 3.317,
189
  "step": 15000
190
  },
191
  {
192
- "epoch": 0.87,
193
- "learning_rate": 5.98828070916214e-05,
194
- "loss": 3.1535,
195
  "step": 15500
196
  },
197
  {
198
- "epoch": 0.9,
199
- "learning_rate": 5.9874869542966605e-05,
200
- "loss": 3.0698,
201
  "step": 16000
202
  },
203
  {
204
- "epoch": 0.93,
205
- "learning_rate": 5.986667248303975e-05,
206
- "loss": 3.1798,
207
  "step": 16500
208
  },
209
  {
210
- "epoch": 0.96,
211
- "learning_rate": 5.985821598304549e-05,
212
- "loss": 3.0956,
213
  "step": 17000
214
  },
215
  {
216
- "epoch": 0.98,
217
- "learning_rate": 5.984950011644212e-05,
218
- "loss": 3.1267,
219
  "step": 17500
220
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  {
222
  "epoch": 1.0,
223
  "eval_bleu": 1.0,
224
  "eval_brevity_penalty": 1.0,
225
  "eval_length_ratio": 1.0,
226
- "eval_loss": 3.0692174434661865,
227
  "eval_precisions": [
228
  1.0,
229
  1.0,
230
  1.0,
231
  1.0
232
  ],
233
- "eval_reference_length": 2023424,
234
- "eval_runtime": 3477.4176,
235
- "eval_samples_per_second": 1.136,
236
- "eval_steps_per_second": 0.568,
237
- "eval_translation_length": 2023424,
238
- "step": 17782
239
  }
240
  ],
241
  "logging_steps": 500,
242
- "max_steps": 533460,
243
  "num_train_epochs": 30,
244
  "save_steps": 1000,
245
- "total_flos": 3.0517701759310234e+17,
246
  "trial_name": null,
247
  "trial_params": null
248
  }
 
1
  {
2
+ "best_metric": 3.1455512046813965,
3
+ "best_model_checkpoint": "dq158/pingusPongus/checkpoint-40162",
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 40162,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.01,
13
+ "learning_rate": 0.0001,
14
+ "loss": 4.5056,
15
  "step": 500
16
  },
17
  {
18
+ "epoch": 0.02,
19
+ "learning_rate": 9.9999957472774e-05,
20
+ "loss": 3.8512,
21
  "step": 1000
22
  },
23
  {
24
+ "epoch": 0.04,
25
+ "learning_rate": 9.999982989116833e-05,
26
+ "loss": 3.6498,
27
  "step": 1500
28
  },
29
  {
30
+ "epoch": 0.05,
31
+ "learning_rate": 9.999961725540003e-05,
32
+ "loss": 3.6328,
33
  "step": 2000
34
  },
35
  {
36
+ "epoch": 0.06,
37
+ "learning_rate": 9.999931956583082e-05,
38
+ "loss": 3.6162,
39
  "step": 2500
40
  },
41
  {
42
+ "epoch": 0.07,
43
+ "learning_rate": 9.999893682296706e-05,
44
+ "loss": 3.6415,
45
  "step": 3000
46
  },
47
  {
48
+ "epoch": 0.09,
49
+ "learning_rate": 9.999846902745986e-05,
50
+ "loss": 3.6273,
51
  "step": 3500
52
  },
53
  {
54
+ "epoch": 0.1,
55
+ "learning_rate": 9.999791618010498e-05,
56
+ "loss": 3.5555,
57
  "step": 4000
58
  },
59
  {
60
+ "epoch": 0.11,
61
+ "learning_rate": 9.999727828184285e-05,
62
+ "loss": 3.5176,
63
  "step": 4500
64
  },
65
  {
66
+ "epoch": 0.12,
67
+ "learning_rate": 9.99965553337586e-05,
68
+ "loss": 3.4374,
69
  "step": 5000
70
  },
71
  {
72
+ "epoch": 0.14,
73
+ "learning_rate": 9.999574733708204e-05,
74
+ "loss": 3.5565,
75
  "step": 5500
76
  },
77
  {
78
+ "epoch": 0.15,
79
+ "learning_rate": 9.999485429318763e-05,
80
+ "loss": 3.4639,
81
  "step": 6000
82
  },
83
  {
84
+ "epoch": 0.16,
85
+ "learning_rate": 9.99938762035945e-05,
86
+ "loss": 3.3833,
87
  "step": 6500
88
  },
89
  {
90
+ "epoch": 0.17,
91
+ "learning_rate": 9.999281306996651e-05,
92
+ "loss": 3.4726,
93
  "step": 7000
94
  },
95
  {
96
+ "epoch": 0.19,
97
+ "learning_rate": 9.999166489411211e-05,
98
+ "loss": 3.4132,
99
  "step": 7500
100
  },
101
  {
102
+ "epoch": 0.2,
103
+ "learning_rate": 9.999043167798448e-05,
104
+ "loss": 3.3754,
105
  "step": 8000
106
  },
107
  {
108
+ "epoch": 0.21,
109
+ "learning_rate": 9.99891134236814e-05,
110
+ "loss": 3.4333,
111
  "step": 8500
112
  },
113
  {
114
+ "epoch": 0.22,
115
+ "learning_rate": 9.998771013344535e-05,
116
+ "loss": 3.3313,
117
  "step": 9000
118
  },
119
  {
120
+ "epoch": 0.24,
121
+ "learning_rate": 9.998622180966344e-05,
122
+ "loss": 3.3758,
123
  "step": 9500
124
  },
125
  {
126
+ "epoch": 0.25,
127
+ "learning_rate": 9.998464845486746e-05,
128
+ "loss": 3.3814,
129
  "step": 10000
130
  },
131
  {
132
+ "epoch": 0.26,
133
+ "learning_rate": 9.998299007173383e-05,
134
+ "loss": 3.429,
135
  "step": 10500
136
  },
137
  {
138
+ "epoch": 0.27,
139
+ "learning_rate": 9.99812466630836e-05,
140
+ "loss": 3.4547,
141
  "step": 11000
142
  },
143
  {
144
+ "epoch": 0.29,
145
+ "learning_rate": 9.997941823188243e-05,
146
+ "loss": 3.4204,
147
  "step": 11500
148
  },
149
  {
150
+ "epoch": 0.3,
151
+ "learning_rate": 9.99775047812407e-05,
152
+ "loss": 3.3807,
153
  "step": 12000
154
  },
155
  {
156
+ "epoch": 0.31,
157
+ "learning_rate": 9.997550631441332e-05,
158
+ "loss": 3.3749,
159
  "step": 12500
160
  },
161
  {
162
+ "epoch": 0.32,
163
+ "learning_rate": 9.997342283479989e-05,
164
+ "loss": 3.326,
165
  "step": 13000
166
  },
167
  {
168
+ "epoch": 0.34,
169
+ "learning_rate": 9.997125434594458e-05,
170
+ "loss": 3.3626,
171
  "step": 13500
172
  },
173
  {
174
+ "epoch": 0.35,
175
+ "learning_rate": 9.996900085153617e-05,
176
+ "loss": 3.304,
177
  "step": 14000
178
  },
179
  {
180
+ "epoch": 0.36,
181
+ "learning_rate": 9.996666235540808e-05,
182
+ "loss": 3.3671,
183
  "step": 14500
184
  },
185
  {
186
+ "epoch": 0.37,
187
+ "learning_rate": 9.996423886153828e-05,
188
+ "loss": 3.3667,
189
  "step": 15000
190
  },
191
  {
192
+ "epoch": 0.39,
193
+ "learning_rate": 9.996173037404934e-05,
194
+ "loss": 3.321,
195
  "step": 15500
196
  },
197
  {
198
+ "epoch": 0.4,
199
+ "learning_rate": 9.995913689720844e-05,
200
+ "loss": 3.2639,
201
  "step": 16000
202
  },
203
  {
204
+ "epoch": 0.41,
205
+ "learning_rate": 9.995645843542732e-05,
206
+ "loss": 3.3529,
207
  "step": 16500
208
  },
209
  {
210
+ "epoch": 0.42,
211
+ "learning_rate": 9.995369499326228e-05,
212
+ "loss": 3.2607,
213
  "step": 17000
214
  },
215
  {
216
+ "epoch": 0.44,
217
+ "learning_rate": 9.995084657541416e-05,
218
+ "loss": 3.2858,
219
  "step": 17500
220
  },
221
+ {
222
+ "epoch": 0.45,
223
+ "learning_rate": 9.994791318672838e-05,
224
+ "loss": 3.3516,
225
+ "step": 18000
226
+ },
227
+ {
228
+ "epoch": 0.46,
229
+ "learning_rate": 9.994489483219492e-05,
230
+ "loss": 3.2944,
231
+ "step": 18500
232
+ },
233
+ {
234
+ "epoch": 0.47,
235
+ "learning_rate": 9.994179151694824e-05,
236
+ "loss": 3.278,
237
+ "step": 19000
238
+ },
239
+ {
240
+ "epoch": 0.49,
241
+ "learning_rate": 9.993860324626737e-05,
242
+ "loss": 3.2557,
243
+ "step": 19500
244
+ },
245
+ {
246
+ "epoch": 0.5,
247
+ "learning_rate": 9.993533002557585e-05,
248
+ "loss": 3.366,
249
+ "step": 20000
250
+ },
251
+ {
252
+ "epoch": 0.51,
253
+ "learning_rate": 9.99319718604417e-05,
254
+ "loss": 3.2581,
255
+ "step": 20500
256
+ },
257
+ {
258
+ "epoch": 0.52,
259
+ "learning_rate": 9.992852875657746e-05,
260
+ "loss": 3.3302,
261
+ "step": 21000
262
+ },
263
+ {
264
+ "epoch": 0.54,
265
+ "learning_rate": 9.992500071984017e-05,
266
+ "loss": 3.334,
267
+ "step": 21500
268
+ },
269
+ {
270
+ "epoch": 0.55,
271
+ "learning_rate": 9.992138775623132e-05,
272
+ "loss": 3.3146,
273
+ "step": 22000
274
+ },
275
+ {
276
+ "epoch": 0.56,
277
+ "learning_rate": 9.991768987189688e-05,
278
+ "loss": 3.3315,
279
+ "step": 22500
280
+ },
281
+ {
282
+ "epoch": 0.57,
283
+ "learning_rate": 9.991390707312733e-05,
284
+ "loss": 3.3853,
285
+ "step": 23000
286
+ },
287
+ {
288
+ "epoch": 0.59,
289
+ "learning_rate": 9.991003936635747e-05,
290
+ "loss": 3.3447,
291
+ "step": 23500
292
+ },
293
+ {
294
+ "epoch": 0.6,
295
+ "learning_rate": 9.990608675816668e-05,
296
+ "loss": 3.1906,
297
+ "step": 24000
298
+ },
299
+ {
300
+ "epoch": 0.61,
301
+ "learning_rate": 9.990204925527867e-05,
302
+ "loss": 3.3639,
303
+ "step": 24500
304
+ },
305
+ {
306
+ "epoch": 0.62,
307
+ "learning_rate": 9.989792686456158e-05,
308
+ "loss": 3.2723,
309
+ "step": 25000
310
+ },
311
+ {
312
+ "epoch": 0.63,
313
+ "learning_rate": 9.989371959302797e-05,
314
+ "loss": 3.2156,
315
+ "step": 25500
316
+ },
317
+ {
318
+ "epoch": 0.65,
319
+ "learning_rate": 9.988942744783481e-05,
320
+ "loss": 3.3264,
321
+ "step": 26000
322
+ },
323
+ {
324
+ "epoch": 0.66,
325
+ "learning_rate": 9.988505043628337e-05,
326
+ "loss": 3.2336,
327
+ "step": 26500
328
+ },
329
+ {
330
+ "epoch": 0.67,
331
+ "learning_rate": 9.98805885658194e-05,
332
+ "loss": 3.2806,
333
+ "step": 27000
334
+ },
335
+ {
336
+ "epoch": 0.68,
337
+ "learning_rate": 9.98760418440329e-05,
338
+ "loss": 3.3251,
339
+ "step": 27500
340
+ },
341
+ {
342
+ "epoch": 0.7,
343
+ "learning_rate": 9.987141027865825e-05,
344
+ "loss": 3.2188,
345
+ "step": 28000
346
+ },
347
+ {
348
+ "epoch": 0.71,
349
+ "learning_rate": 9.986669387757414e-05,
350
+ "loss": 3.2981,
351
+ "step": 28500
352
+ },
353
+ {
354
+ "epoch": 0.72,
355
+ "learning_rate": 9.986189264880364e-05,
356
+ "loss": 3.3023,
357
+ "step": 29000
358
+ },
359
+ {
360
+ "epoch": 0.73,
361
+ "learning_rate": 9.985700660051403e-05,
362
+ "loss": 3.3271,
363
+ "step": 29500
364
+ },
365
+ {
366
+ "epoch": 0.75,
367
+ "learning_rate": 9.985203574101691e-05,
368
+ "loss": 3.2955,
369
+ "step": 30000
370
+ },
371
+ {
372
+ "epoch": 0.76,
373
+ "learning_rate": 9.984698007876816e-05,
374
+ "loss": 3.3756,
375
+ "step": 30500
376
+ },
377
+ {
378
+ "epoch": 0.77,
379
+ "learning_rate": 9.984183962236792e-05,
380
+ "loss": 3.2936,
381
+ "step": 31000
382
+ },
383
+ {
384
+ "epoch": 0.78,
385
+ "learning_rate": 9.983661438056056e-05,
386
+ "loss": 3.3248,
387
+ "step": 31500
388
+ },
389
+ {
390
+ "epoch": 0.8,
391
+ "learning_rate": 9.983130436223469e-05,
392
+ "loss": 3.2569,
393
+ "step": 32000
394
+ },
395
+ {
396
+ "epoch": 0.81,
397
+ "learning_rate": 9.98259095764231e-05,
398
+ "loss": 3.3616,
399
+ "step": 32500
400
+ },
401
+ {
402
+ "epoch": 0.82,
403
+ "learning_rate": 9.982043003230282e-05,
404
+ "loss": 3.2892,
405
+ "step": 33000
406
+ },
407
+ {
408
+ "epoch": 0.83,
409
+ "learning_rate": 9.981486573919504e-05,
410
+ "loss": 3.2331,
411
+ "step": 33500
412
+ },
413
+ {
414
+ "epoch": 0.85,
415
+ "learning_rate": 9.98092167065651e-05,
416
+ "loss": 3.3242,
417
+ "step": 34000
418
+ },
419
+ {
420
+ "epoch": 0.86,
421
+ "learning_rate": 9.980348294402255e-05,
422
+ "loss": 3.3123,
423
+ "step": 34500
424
+ },
425
+ {
426
+ "epoch": 0.87,
427
+ "learning_rate": 9.9797664461321e-05,
428
+ "loss": 3.2869,
429
+ "step": 35000
430
+ },
431
+ {
432
+ "epoch": 0.88,
433
+ "learning_rate": 9.979176126835821e-05,
434
+ "loss": 3.2566,
435
+ "step": 35500
436
+ },
437
+ {
438
+ "epoch": 0.9,
439
+ "learning_rate": 9.978577337517603e-05,
440
+ "loss": 3.3036,
441
+ "step": 36000
442
+ },
443
+ {
444
+ "epoch": 0.91,
445
+ "learning_rate": 9.977970079196041e-05,
446
+ "loss": 3.299,
447
+ "step": 36500
448
+ },
449
+ {
450
+ "epoch": 0.92,
451
+ "learning_rate": 9.977354352904136e-05,
452
+ "loss": 3.31,
453
+ "step": 37000
454
+ },
455
+ {
456
+ "epoch": 0.93,
457
+ "learning_rate": 9.976730159689292e-05,
458
+ "loss": 3.2459,
459
+ "step": 37500
460
+ },
461
+ {
462
+ "epoch": 0.95,
463
+ "learning_rate": 9.976097500613318e-05,
464
+ "loss": 3.3162,
465
+ "step": 38000
466
+ },
467
+ {
468
+ "epoch": 0.96,
469
+ "learning_rate": 9.975456376752424e-05,
470
+ "loss": 3.3056,
471
+ "step": 38500
472
+ },
473
+ {
474
+ "epoch": 0.97,
475
+ "learning_rate": 9.974806789197216e-05,
476
+ "loss": 3.2404,
477
+ "step": 39000
478
+ },
479
+ {
480
+ "epoch": 0.98,
481
+ "learning_rate": 9.974148739052703e-05,
482
+ "loss": 3.2376,
483
+ "step": 39500
484
+ },
485
+ {
486
+ "epoch": 1.0,
487
+ "learning_rate": 9.973482227438287e-05,
488
+ "loss": 3.356,
489
+ "step": 40000
490
+ },
491
  {
492
  "epoch": 1.0,
493
  "eval_bleu": 1.0,
494
  "eval_brevity_penalty": 1.0,
495
  "eval_length_ratio": 1.0,
496
+ "eval_loss": 3.1455512046813965,
497
  "eval_precisions": [
498
  1.0,
499
  1.0,
500
  1.0,
501
  1.0
502
  ],
503
+ "eval_reference_length": 4569600,
504
+ "eval_runtime": 8741.384,
505
+ "eval_samples_per_second": 1.021,
506
+ "eval_steps_per_second": 0.511,
507
+ "eval_translation_length": 4569600,
508
+ "step": 40162
509
  }
510
  ],
511
  "logging_steps": 500,
512
+ "max_steps": 1204860,
513
  "num_train_epochs": 30,
514
  "save_steps": 1000,
515
+ "total_flos": 6.892848961321697e+17,
516
  "trial_name": null,
517
  "trial_params": null
518
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c3d95e82f3766efe05da841f4af2236c57541bc9acc301ad4339595e386ca971
3
  size 4728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b62efb5e27cf22233e7e3d90ba4b9b06ff50053e55dc9f07a3ad40dc5b14b43a
3
  size 4728