sheepy928 commited on
Commit
bf1de57
1 Parent(s): 30474ca

Training in progress, step 300, checkpoint

Browse files
checkpoint-300/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6949cc5fe128ea77a8763aca2ba7fdd5811717c63b706afff87623840b7b32f6
3
- size 4747538
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ee96670e1d9bcd7adca0282c0d39735bad3e967dc000e558f5d43e17298ebfd
3
+ size 997351674
checkpoint-300/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:83f15c0f78f1c98015347f8a787c00850813657b8617f853bc200d97f023d094
3
  size 498661166
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67cab1134b2799d45e89fb3f0bbe82e6e8445934df94f82146a3bd71bdfa6bc0
3
  size 498661166
checkpoint-300/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:167002ff49683fca0d12268d9b1d429ff2ab9c99e6a928fa8557947bce839be0
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71102892e2bac0b0446e50f1e8632e478a48cf16c2298ba62ce6df9fa16b4503
3
  size 14244
checkpoint-300/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1215f332ab24bebfcc1a307cc4f2884f6afb056bdd9310ee11423db2271cea0
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e18669e28aa3a789dda6aed95ba77214761ae88a8f3463ce62d22ffd7afab00
3
  size 1064
checkpoint-300/tokenizer.json CHANGED
@@ -1,21 +1,7 @@
1
  {
2
  "version": "1.0",
3
- "truncation": {
4
- "direction": "Right",
5
- "max_length": 512,
6
- "strategy": "LongestFirst",
7
- "stride": 0
8
- },
9
- "padding": {
10
- "strategy": {
11
- "Fixed": 512
12
- },
13
- "direction": "Right",
14
- "pad_to_multiple_of": null,
15
- "pad_id": 1,
16
- "pad_type_id": 0,
17
- "pad_token": "<pad>"
18
- },
19
  "added_tokens": [
20
  {
21
  "id": 0,
 
1
  {
2
  "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
checkpoint-300/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 1.5957446808510638,
5
- "eval_steps": 20,
6
  "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
@@ -10,364 +10,182 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.05,
13
- "learning_rate": 0.0004995563442768412,
14
- "loss": 1.2913,
15
  "step": 10
16
  },
17
  {
18
  "epoch": 0.11,
19
- "learning_rate": 0.0004986690328305235,
20
- "loss": 1.1578,
21
- "step": 20
22
- },
23
- {
24
- "epoch": 0.11,
25
- "eval_accuracy": 0.7386666666666667,
26
- "eval_combined_score": 0.6626504648943422,
27
- "eval_f1": 0.6276400817995911,
28
- "eval_loss": 0.7813256978988647,
29
- "eval_precision": 0.5456284444444445,
30
- "eval_recall": 0.7386666666666667,
31
- "eval_runtime": 5.7155,
32
- "eval_samples_per_second": 262.444,
33
- "eval_steps_per_second": 8.223,
34
  "step": 20
35
  },
36
  {
37
  "epoch": 0.16,
38
- "learning_rate": 0.0004977817213842058,
39
- "loss": 0.8742,
40
  "step": 30
41
  },
42
  {
43
  "epoch": 0.21,
44
- "learning_rate": 0.0004968944099378882,
45
- "loss": 0.7537,
46
- "step": 40
47
- },
48
- {
49
- "epoch": 0.21,
50
- "eval_accuracy": 0.7386666666666667,
51
- "eval_combined_score": 0.6626504648943422,
52
- "eval_f1": 0.6276400817995911,
53
- "eval_loss": 0.792127251625061,
54
- "eval_precision": 0.5456284444444445,
55
- "eval_recall": 0.7386666666666667,
56
- "eval_runtime": 5.6846,
57
- "eval_samples_per_second": 263.869,
58
- "eval_steps_per_second": 8.268,
59
  "step": 40
60
  },
61
  {
62
  "epoch": 0.27,
63
- "learning_rate": 0.0004960070984915705,
64
- "loss": 0.8076,
65
  "step": 50
66
  },
67
  {
68
  "epoch": 0.32,
69
- "learning_rate": 0.0004951197870452529,
70
- "loss": 0.7436,
71
- "step": 60
72
- },
73
- {
74
- "epoch": 0.32,
75
- "eval_accuracy": 0.7386666666666667,
76
- "eval_combined_score": 0.6626504648943422,
77
- "eval_f1": 0.6276400817995911,
78
- "eval_loss": 0.7419535517692566,
79
- "eval_precision": 0.5456284444444445,
80
- "eval_recall": 0.7386666666666667,
81
- "eval_runtime": 5.7977,
82
- "eval_samples_per_second": 258.725,
83
- "eval_steps_per_second": 8.107,
84
  "step": 60
85
  },
86
  {
87
  "epoch": 0.37,
88
- "learning_rate": 0.0004942324755989353,
89
- "loss": 0.7465,
90
  "step": 70
91
  },
92
  {
93
  "epoch": 0.43,
94
- "learning_rate": 0.0004933451641526176,
95
- "loss": 0.6516,
96
- "step": 80
97
- },
98
- {
99
- "epoch": 0.43,
100
- "eval_accuracy": 0.7386666666666667,
101
- "eval_combined_score": 0.6626504648943422,
102
- "eval_f1": 0.6276400817995911,
103
- "eval_loss": 0.7484750747680664,
104
- "eval_precision": 0.5456284444444445,
105
- "eval_recall": 0.7386666666666667,
106
- "eval_runtime": 5.7051,
107
- "eval_samples_per_second": 262.923,
108
- "eval_steps_per_second": 8.238,
109
  "step": 80
110
  },
111
  {
112
  "epoch": 0.48,
113
- "learning_rate": 0.0004924578527063,
114
- "loss": 0.9634,
115
  "step": 90
116
  },
117
  {
118
  "epoch": 0.53,
119
- "learning_rate": 0.0004915705412599822,
120
- "loss": 0.8011,
121
- "step": 100
122
- },
123
- {
124
- "epoch": 0.53,
125
- "eval_accuracy": 0.7386666666666667,
126
- "eval_combined_score": 0.6626504648943422,
127
- "eval_f1": 0.6276400817995911,
128
- "eval_loss": 0.7428026795387268,
129
- "eval_precision": 0.5456284444444445,
130
- "eval_recall": 0.7386666666666667,
131
- "eval_runtime": 5.7042,
132
- "eval_samples_per_second": 262.962,
133
- "eval_steps_per_second": 8.239,
134
  "step": 100
135
  },
136
  {
137
  "epoch": 0.59,
138
- "learning_rate": 0.0004906832298136646,
139
- "loss": 0.8691,
140
  "step": 110
141
  },
142
  {
143
  "epoch": 0.64,
144
- "learning_rate": 0.0004897959183673469,
145
- "loss": 0.8761,
146
- "step": 120
147
- },
148
- {
149
- "epoch": 0.64,
150
- "eval_accuracy": 0.7386666666666667,
151
- "eval_combined_score": 0.6626504648943422,
152
- "eval_f1": 0.6276400817995911,
153
- "eval_loss": 0.7458600997924805,
154
- "eval_precision": 0.5456284444444445,
155
- "eval_recall": 0.7386666666666667,
156
- "eval_runtime": 5.6685,
157
- "eval_samples_per_second": 264.622,
158
- "eval_steps_per_second": 8.292,
159
  "step": 120
160
  },
161
  {
162
  "epoch": 0.69,
163
- "learning_rate": 0.0004889086069210293,
164
- "loss": 0.828,
165
  "step": 130
166
  },
167
  {
168
  "epoch": 0.74,
169
- "learning_rate": 0.00048802129547471164,
170
- "loss": 0.8708,
171
- "step": 140
172
- },
173
- {
174
- "epoch": 0.74,
175
- "eval_accuracy": 0.7386666666666667,
176
- "eval_combined_score": 0.6626504648943422,
177
- "eval_f1": 0.6276400817995911,
178
- "eval_loss": 0.7820696830749512,
179
- "eval_precision": 0.5456284444444445,
180
- "eval_recall": 0.7386666666666667,
181
- "eval_runtime": 5.6616,
182
- "eval_samples_per_second": 264.945,
183
- "eval_steps_per_second": 8.302,
184
  "step": 140
185
  },
186
  {
187
  "epoch": 0.8,
188
- "learning_rate": 0.000487133984028394,
189
- "loss": 0.7489,
190
  "step": 150
191
  },
192
  {
193
  "epoch": 0.85,
194
- "learning_rate": 0.0004862466725820763,
195
- "loss": 0.9504,
196
- "step": 160
197
- },
198
- {
199
- "epoch": 0.85,
200
- "eval_accuracy": 0.7386666666666667,
201
- "eval_combined_score": 0.6626504648943422,
202
- "eval_f1": 0.6276400817995911,
203
- "eval_loss": 0.7716627717018127,
204
- "eval_precision": 0.5456284444444445,
205
- "eval_recall": 0.7386666666666667,
206
- "eval_runtime": 5.6993,
207
- "eval_samples_per_second": 263.189,
208
- "eval_steps_per_second": 8.247,
209
  "step": 160
210
  },
211
  {
212
  "epoch": 0.9,
213
- "learning_rate": 0.0004853593611357587,
214
- "loss": 0.5649,
215
  "step": 170
216
  },
217
  {
218
  "epoch": 0.96,
219
- "learning_rate": 0.00048447204968944104,
220
- "loss": 1.1222,
221
- "step": 180
222
- },
223
- {
224
- "epoch": 0.96,
225
- "eval_accuracy": 0.7386666666666667,
226
- "eval_combined_score": 0.6626504648943422,
227
- "eval_f1": 0.6276400817995911,
228
- "eval_loss": 0.9907371401786804,
229
- "eval_precision": 0.5456284444444445,
230
- "eval_recall": 0.7386666666666667,
231
- "eval_runtime": 5.6565,
232
- "eval_samples_per_second": 265.181,
233
- "eval_steps_per_second": 8.309,
234
  "step": 180
235
  },
236
  {
237
  "epoch": 1.01,
238
- "learning_rate": 0.00048358473824312333,
239
- "loss": 0.9109,
240
  "step": 190
241
  },
242
  {
243
  "epoch": 1.06,
244
- "learning_rate": 0.00048269742679680566,
245
- "loss": 0.7528,
246
- "step": 200
247
- },
248
- {
249
- "epoch": 1.06,
250
- "eval_accuracy": 0.7386666666666667,
251
- "eval_combined_score": 0.6626504648943422,
252
- "eval_f1": 0.6276400817995911,
253
- "eval_loss": 0.7575691938400269,
254
- "eval_precision": 0.5456284444444445,
255
- "eval_recall": 0.7386666666666667,
256
- "eval_runtime": 5.6695,
257
- "eval_samples_per_second": 264.575,
258
- "eval_steps_per_second": 8.29,
259
  "step": 200
260
  },
261
  {
262
  "epoch": 1.12,
263
- "learning_rate": 0.000481810115350488,
264
- "loss": 0.7253,
265
  "step": 210
266
  },
267
  {
268
  "epoch": 1.17,
269
- "learning_rate": 0.0004809228039041704,
270
- "loss": 0.7923,
271
- "step": 220
272
- },
273
- {
274
- "epoch": 1.17,
275
- "eval_accuracy": 0.7386666666666667,
276
- "eval_combined_score": 0.6626504648943422,
277
- "eval_f1": 0.6276400817995911,
278
- "eval_loss": 0.767310380935669,
279
- "eval_precision": 0.5456284444444445,
280
- "eval_recall": 0.7386666666666667,
281
- "eval_runtime": 5.6737,
282
- "eval_samples_per_second": 264.377,
283
- "eval_steps_per_second": 8.284,
284
  "step": 220
285
  },
286
  {
287
  "epoch": 1.22,
288
- "learning_rate": 0.00048003549245785273,
289
- "loss": 0.7464,
290
  "step": 230
291
  },
292
  {
293
  "epoch": 1.28,
294
- "learning_rate": 0.00047914818101153507,
295
- "loss": 0.7993,
296
- "step": 240
297
- },
298
- {
299
- "epoch": 1.28,
300
- "eval_accuracy": 0.7386666666666667,
301
- "eval_combined_score": 0.6626504648943422,
302
- "eval_f1": 0.6276400817995911,
303
- "eval_loss": 0.7693590521812439,
304
- "eval_precision": 0.5456284444444445,
305
- "eval_recall": 0.7386666666666667,
306
- "eval_runtime": 5.6632,
307
- "eval_samples_per_second": 264.868,
308
- "eval_steps_per_second": 8.299,
309
  "step": 240
310
  },
311
  {
312
  "epoch": 1.33,
313
- "learning_rate": 0.0004782608695652174,
314
- "loss": 0.6849,
315
  "step": 250
316
  },
317
  {
318
  "epoch": 1.38,
319
- "learning_rate": 0.00047737355811889974,
320
- "loss": 0.8644,
321
- "step": 260
322
- },
323
- {
324
- "epoch": 1.38,
325
- "eval_accuracy": 0.7386666666666667,
326
- "eval_combined_score": 0.6626504648943422,
327
- "eval_f1": 0.6276400817995911,
328
- "eval_loss": 0.7374696731567383,
329
- "eval_precision": 0.5456284444444445,
330
- "eval_recall": 0.7386666666666667,
331
- "eval_runtime": 5.6531,
332
- "eval_samples_per_second": 265.339,
333
- "eval_steps_per_second": 8.314,
334
  "step": 260
335
  },
336
  {
337
  "epoch": 1.44,
338
- "learning_rate": 0.00047648624667258213,
339
- "loss": 0.7324,
340
  "step": 270
341
  },
342
  {
343
  "epoch": 1.49,
344
- "learning_rate": 0.0004755989352262644,
345
- "loss": 0.7368,
346
- "step": 280
347
- },
348
- {
349
- "epoch": 1.49,
350
- "eval_accuracy": 0.7386666666666667,
351
- "eval_combined_score": 0.6626504648943422,
352
- "eval_f1": 0.6276400817995911,
353
- "eval_loss": 0.7444203495979309,
354
- "eval_precision": 0.5456284444444445,
355
- "eval_recall": 0.7386666666666667,
356
- "eval_runtime": 5.6457,
357
- "eval_samples_per_second": 265.688,
358
- "eval_steps_per_second": 8.325,
359
  "step": 280
360
  },
361
  {
362
  "epoch": 1.54,
363
- "learning_rate": 0.00047471162377994675,
364
- "loss": 0.6645,
365
  "step": 290
366
  },
367
  {
368
  "epoch": 1.6,
369
- "learning_rate": 0.0004738243123336291,
370
- "loss": 0.7621,
371
  "step": 300
372
  },
373
  {
@@ -375,19 +193,19 @@
375
  "eval_accuracy": 0.7386666666666667,
376
  "eval_combined_score": 0.6626504648943422,
377
  "eval_f1": 0.6276400817995911,
378
- "eval_loss": 0.7361482977867126,
379
  "eval_precision": 0.5456284444444445,
380
  "eval_recall": 0.7386666666666667,
381
- "eval_runtime": 5.6438,
382
- "eval_samples_per_second": 265.777,
383
- "eval_steps_per_second": 8.328,
384
  "step": 300
385
  }
386
  ],
387
  "logging_steps": 10,
388
  "max_steps": 5640,
389
  "num_train_epochs": 30,
390
- "save_steps": 100,
391
  "total_flos": 630419726794752.0,
392
  "trial_name": null,
393
  "trial_params": null
 
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 1.5957446808510638,
5
+ "eval_steps": 300,
6
  "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.05,
13
+ "learning_rate": 5e-07,
14
+ "loss": 1.0874,
15
  "step": 10
16
  },
17
  {
18
  "epoch": 0.11,
19
+ "learning_rate": 1e-06,
20
+ "loss": 0.9487,
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  "step": 20
22
  },
23
  {
24
  "epoch": 0.16,
25
+ "learning_rate": 1.5e-06,
26
+ "loss": 0.7586,
27
  "step": 30
28
  },
29
  {
30
  "epoch": 0.21,
31
+ "learning_rate": 2e-06,
32
+ "loss": 0.7225,
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  "step": 40
34
  },
35
  {
36
  "epoch": 0.27,
37
+ "learning_rate": 2.5e-06,
38
+ "loss": 0.7364,
39
  "step": 50
40
  },
41
  {
42
  "epoch": 0.32,
43
+ "learning_rate": 3e-06,
44
+ "loss": 0.7265,
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  "step": 60
46
  },
47
  {
48
  "epoch": 0.37,
49
+ "learning_rate": 3.5e-06,
50
+ "loss": 0.7267,
51
  "step": 70
52
  },
53
  {
54
  "epoch": 0.43,
55
+ "learning_rate": 4e-06,
56
+ "loss": 0.5697,
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  "step": 80
58
  },
59
  {
60
  "epoch": 0.48,
61
+ "learning_rate": 4.5e-06,
62
+ "loss": 1.018,
63
  "step": 90
64
  },
65
  {
66
  "epoch": 0.53,
67
+ "learning_rate": 5e-06,
68
+ "loss": 0.7875,
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  "step": 100
70
  },
71
  {
72
  "epoch": 0.59,
73
+ "learning_rate": 5.5e-06,
74
+ "loss": 0.8242,
75
  "step": 110
76
  },
77
  {
78
  "epoch": 0.64,
79
+ "learning_rate": 6e-06,
80
+ "loss": 0.8034,
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  "step": 120
82
  },
83
  {
84
  "epoch": 0.69,
85
+ "learning_rate": 6.5e-06,
86
+ "loss": 0.7717,
87
  "step": 130
88
  },
89
  {
90
  "epoch": 0.74,
91
+ "learning_rate": 7e-06,
92
+ "loss": 0.8337,
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  "step": 140
94
  },
95
  {
96
  "epoch": 0.8,
97
+ "learning_rate": 7.5e-06,
98
+ "loss": 0.6884,
99
  "step": 150
100
  },
101
  {
102
  "epoch": 0.85,
103
+ "learning_rate": 8e-06,
104
+ "loss": 0.9129,
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  "step": 160
106
  },
107
  {
108
  "epoch": 0.9,
109
+ "learning_rate": 8.500000000000002e-06,
110
+ "loss": 0.5637,
111
  "step": 170
112
  },
113
  {
114
  "epoch": 0.96,
115
+ "learning_rate": 9e-06,
116
+ "loss": 1.0458,
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  "step": 180
118
  },
119
  {
120
  "epoch": 1.01,
121
+ "learning_rate": 9.5e-06,
122
+ "loss": 0.9414,
123
  "step": 190
124
  },
125
  {
126
  "epoch": 1.06,
127
+ "learning_rate": 1e-05,
128
+ "loss": 0.6379,
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  "step": 200
130
  },
131
  {
132
  "epoch": 1.12,
133
+ "learning_rate": 1.0500000000000001e-05,
134
+ "loss": 0.9249,
135
  "step": 210
136
  },
137
  {
138
  "epoch": 1.17,
139
+ "learning_rate": 1.1e-05,
140
+ "loss": 0.6944,
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  "step": 220
142
  },
143
  {
144
  "epoch": 1.22,
145
+ "learning_rate": 1.15e-05,
146
+ "loss": 0.9221,
147
  "step": 230
148
  },
149
  {
150
  "epoch": 1.28,
151
+ "learning_rate": 1.2e-05,
152
+ "loss": 0.6475,
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  "step": 240
154
  },
155
  {
156
  "epoch": 1.33,
157
+ "learning_rate": 1.25e-05,
158
+ "loss": 0.7748,
159
  "step": 250
160
  },
161
  {
162
  "epoch": 1.38,
163
+ "learning_rate": 1.3e-05,
164
+ "loss": 0.8705,
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  "step": 260
166
  },
167
  {
168
  "epoch": 1.44,
169
+ "learning_rate": 1.35e-05,
170
+ "loss": 0.7737,
171
  "step": 270
172
  },
173
  {
174
  "epoch": 1.49,
175
+ "learning_rate": 1.4e-05,
176
+ "loss": 0.8643,
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  "step": 280
178
  },
179
  {
180
  "epoch": 1.54,
181
+ "learning_rate": 1.4500000000000002e-05,
182
+ "loss": 0.8428,
183
  "step": 290
184
  },
185
  {
186
  "epoch": 1.6,
187
+ "learning_rate": 1.5e-05,
188
+ "loss": 0.6785,
189
  "step": 300
190
  },
191
  {
 
193
  "eval_accuracy": 0.7386666666666667,
194
  "eval_combined_score": 0.6626504648943422,
195
  "eval_f1": 0.6276400817995911,
196
+ "eval_loss": 0.7930460572242737,
197
  "eval_precision": 0.5456284444444445,
198
  "eval_recall": 0.7386666666666667,
199
+ "eval_runtime": 6.0663,
200
+ "eval_samples_per_second": 247.266,
201
+ "eval_steps_per_second": 7.748,
202
  "step": 300
203
  }
204
  ],
205
  "logging_steps": 10,
206
  "max_steps": 5640,
207
  "num_train_epochs": 30,
208
+ "save_steps": 300,
209
  "total_flos": 630419726794752.0,
210
  "trial_name": null,
211
  "trial_params": null
checkpoint-300/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:94f7cf02005ce236de7bb1fda33a06f2c3053bafb778c1f449d2ec5279f3a3ec
3
  size 4472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05b42247af1886e7e4142c2daf8d1dc0efc9906f62aab884f8ebe56da5f0ce3e
3
  size 4472