tyzhu committed on
Commit
ce362d0
1 Parent(s): 94a8121

Model save

Browse files
Files changed (6) hide show
  1. README.md +23 -23
  2. all_results.json +15 -0
  3. eval_results.json +10 -0
  4. train_results.json +8 -0
  5. trainer_state.json +430 -0
  6. training_args.bin +1 -1
README.md CHANGED
@@ -15,8 +15,8 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  This model was trained from scratch on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.8055
19
  - Accuracy: 0.7597
 
20
 
21
  ## Model description
22
 
@@ -51,28 +51,28 @@ The following hyperparameters were used during training:
51
 
52
  ### Training results
53
 
54
- | Training Loss | Epoch | Step | Validation Loss | Accuracy |
55
- |:-------------:|:-----:|:----:|:---------------:|:--------:|
56
- | 0.5411 | 1.0 | 187 | 0.3938 | 0.7904 |
57
- | 0.362 | 2.0 | 375 | 0.3804 | 0.7918 |
58
- | 0.3047 | 3.0 | 562 | 0.3934 | 0.7891 |
59
- | 0.2469 | 4.0 | 750 | 0.4226 | 0.7846 |
60
- | 0.2022 | 5.0 | 937 | 0.4661 | 0.7803 |
61
- | 0.1681 | 6.0 | 1125 | 0.5123 | 0.7761 |
62
- | 0.1404 | 7.0 | 1312 | 0.5731 | 0.7721 |
63
- | 0.1197 | 8.0 | 1500 | 0.6075 | 0.7701 |
64
- | 0.1 | 9.0 | 1687 | 0.6317 | 0.7688 |
65
- | 0.089 | 10.0 | 1875 | 0.6718 | 0.7664 |
66
- | 0.0837 | 11.0 | 2062 | 0.6922 | 0.7653 |
67
- | 0.0788 | 12.0 | 2250 | 0.7254 | 0.7632 |
68
- | 0.0761 | 13.0 | 2437 | 0.7256 | 0.7629 |
69
- | 0.0749 | 14.0 | 2625 | 0.7534 | 0.7621 |
70
- | 0.0741 | 15.0 | 2812 | 0.7529 | 0.7620 |
71
- | 0.0726 | 16.0 | 3000 | 0.7678 | 0.7611 |
72
- | 0.0687 | 17.0 | 3187 | 0.7728 | 0.7610 |
73
- | 0.0682 | 18.0 | 3375 | 0.7807 | 0.7603 |
74
- | 0.0682 | 19.0 | 3562 | 0.7872 | 0.7610 |
75
- | 0.0682 | 19.95 | 3740 | 0.8055 | 0.7597 |
76
 
77
 
78
  ### Framework versions
 
15
 
16
  This model was trained from scratch on an unknown dataset.
17
  It achieves the following results on the evaluation set:
 
18
  - Accuracy: 0.7597
19
+ - Loss: 0.8055
20
 
21
  ## Model description
22
 
 
51
 
52
  ### Training results
53
 
54
+ | Training Loss | Epoch | Step | Accuracy | Validation Loss |
55
+ |:-------------:|:-----:|:----:|:--------:|:---------------:|
56
+ | 0.5411 | 1.0 | 187 | 0.7904 | 0.3938 |
57
+ | 0.362 | 2.0 | 375 | 0.7918 | 0.3804 |
58
+ | 0.3047 | 3.0 | 562 | 0.7891 | 0.3934 |
59
+ | 0.2469 | 4.0 | 750 | 0.7846 | 0.4226 |
60
+ | 0.2022 | 5.0 | 937 | 0.7803 | 0.4661 |
61
+ | 0.1681 | 6.0 | 1125 | 0.7761 | 0.5123 |
62
+ | 0.1404 | 7.0 | 1312 | 0.7721 | 0.5731 |
63
+ | 0.1197 | 8.0 | 1500 | 0.7701 | 0.6075 |
64
+ | 0.1 | 9.0 | 1687 | 0.7688 | 0.6317 |
65
+ | 0.089 | 10.0 | 1875 | 0.7664 | 0.6718 |
66
+ | 0.0837 | 11.0 | 2062 | 0.7653 | 0.6922 |
67
+ | 0.0788 | 12.0 | 2250 | 0.7632 | 0.7254 |
68
+ | 0.0761 | 13.0 | 2437 | 0.7629 | 0.7256 |
69
+ | 0.0749 | 14.0 | 2625 | 0.7621 | 0.7534 |
70
+ | 0.0741 | 15.0 | 2812 | 0.7620 | 0.7529 |
71
+ | 0.0726 | 16.0 | 3000 | 0.7611 | 0.7678 |
72
+ | 0.0687 | 17.0 | 3187 | 0.7610 | 0.7728 |
73
+ | 0.0682 | 18.0 | 3375 | 0.7603 | 0.7807 |
74
+ | 0.0682 | 19.0 | 3562 | 0.7610 | 0.7872 |
75
+ | 0.0682 | 19.95 | 3740 | 0.7597 | 0.8055 |
76
 
77
 
78
  ### Framework versions
all_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 19.95,
3
+ "eval_accuracy": 0.7597490196078431,
4
+ "eval_loss": 0.8054783940315247,
5
+ "eval_runtime": 6.7173,
6
+ "eval_samples": 500,
7
+ "eval_samples_per_second": 74.434,
8
+ "eval_steps_per_second": 9.379,
9
+ "perplexity": 2.237766777062304,
10
+ "train_loss": 0.1472176224152672,
11
+ "train_runtime": 2965.7973,
12
+ "train_samples": 6000,
13
+ "train_samples_per_second": 40.461,
14
+ "train_steps_per_second": 1.261
15
+ }
eval_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 19.95,
3
+ "eval_accuracy": 0.7597490196078431,
4
+ "eval_loss": 0.8054783940315247,
5
+ "eval_runtime": 6.7173,
6
+ "eval_samples": 500,
7
+ "eval_samples_per_second": 74.434,
8
+ "eval_steps_per_second": 9.379,
9
+ "perplexity": 2.237766777062304
10
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 19.95,
3
+ "train_loss": 0.1472176224152672,
4
+ "train_runtime": 2965.7973,
5
+ "train_samples": 6000,
6
+ "train_samples_per_second": 40.461,
7
+ "train_steps_per_second": 1.261
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 19.946666666666665,
5
+ "eval_steps": 500,
6
+ "global_step": 3740,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.53,
13
+ "learning_rate": 0.0001,
14
+ "loss": 0.5411,
15
+ "step": 100
16
+ },
17
+ {
18
+ "epoch": 1.0,
19
+ "eval_accuracy": 0.7903686274509804,
20
+ "eval_loss": 0.39375776052474976,
21
+ "eval_runtime": 7.7091,
22
+ "eval_samples_per_second": 64.858,
23
+ "eval_steps_per_second": 8.172,
24
+ "step": 187
25
+ },
26
+ {
27
+ "epoch": 1.07,
28
+ "learning_rate": 0.0001,
29
+ "loss": 0.4081,
30
+ "step": 200
31
+ },
32
+ {
33
+ "epoch": 1.6,
34
+ "learning_rate": 0.0001,
35
+ "loss": 0.362,
36
+ "step": 300
37
+ },
38
+ {
39
+ "epoch": 2.0,
40
+ "eval_accuracy": 0.7917960784313726,
41
+ "eval_loss": 0.3804325461387634,
42
+ "eval_runtime": 7.4651,
43
+ "eval_samples_per_second": 66.979,
44
+ "eval_steps_per_second": 8.439,
45
+ "step": 375
46
+ },
47
+ {
48
+ "epoch": 2.13,
49
+ "learning_rate": 0.0001,
50
+ "loss": 0.3376,
51
+ "step": 400
52
+ },
53
+ {
54
+ "epoch": 2.67,
55
+ "learning_rate": 0.0001,
56
+ "loss": 0.3047,
57
+ "step": 500
58
+ },
59
+ {
60
+ "epoch": 3.0,
61
+ "eval_accuracy": 0.7891450980392157,
62
+ "eval_loss": 0.39341700077056885,
63
+ "eval_runtime": 7.1191,
64
+ "eval_samples_per_second": 70.233,
65
+ "eval_steps_per_second": 8.849,
66
+ "step": 562
67
+ },
68
+ {
69
+ "epoch": 3.2,
70
+ "learning_rate": 0.0001,
71
+ "loss": 0.2848,
72
+ "step": 600
73
+ },
74
+ {
75
+ "epoch": 3.73,
76
+ "learning_rate": 0.0001,
77
+ "loss": 0.2469,
78
+ "step": 700
79
+ },
80
+ {
81
+ "epoch": 4.0,
82
+ "eval_accuracy": 0.7846274509803921,
83
+ "eval_loss": 0.42259183526039124,
84
+ "eval_runtime": 6.9941,
85
+ "eval_samples_per_second": 71.489,
86
+ "eval_steps_per_second": 9.008,
87
+ "step": 750
88
+ },
89
+ {
90
+ "epoch": 4.27,
91
+ "learning_rate": 0.0001,
92
+ "loss": 0.2215,
93
+ "step": 800
94
+ },
95
+ {
96
+ "epoch": 4.8,
97
+ "learning_rate": 0.0001,
98
+ "loss": 0.2022,
99
+ "step": 900
100
+ },
101
+ {
102
+ "epoch": 5.0,
103
+ "eval_accuracy": 0.7803058823529412,
104
+ "eval_loss": 0.4660645127296448,
105
+ "eval_runtime": 6.6511,
106
+ "eval_samples_per_second": 75.175,
107
+ "eval_steps_per_second": 9.472,
108
+ "step": 937
109
+ },
110
+ {
111
+ "epoch": 5.33,
112
+ "learning_rate": 0.0001,
113
+ "loss": 0.1783,
114
+ "step": 1000
115
+ },
116
+ {
117
+ "epoch": 5.87,
118
+ "learning_rate": 0.0001,
119
+ "loss": 0.1681,
120
+ "step": 1100
121
+ },
122
+ {
123
+ "epoch": 6.0,
124
+ "eval_accuracy": 0.7761333333333333,
125
+ "eval_loss": 0.5122596621513367,
126
+ "eval_runtime": 7.6517,
127
+ "eval_samples_per_second": 65.345,
128
+ "eval_steps_per_second": 8.233,
129
+ "step": 1125
130
+ },
131
+ {
132
+ "epoch": 6.4,
133
+ "learning_rate": 0.0001,
134
+ "loss": 0.1437,
135
+ "step": 1200
136
+ },
137
+ {
138
+ "epoch": 6.93,
139
+ "learning_rate": 0.0001,
140
+ "loss": 0.1404,
141
+ "step": 1300
142
+ },
143
+ {
144
+ "epoch": 7.0,
145
+ "eval_accuracy": 0.7720627450980392,
146
+ "eval_loss": 0.5730750560760498,
147
+ "eval_runtime": 7.0342,
148
+ "eval_samples_per_second": 71.081,
149
+ "eval_steps_per_second": 8.956,
150
+ "step": 1312
151
+ },
152
+ {
153
+ "epoch": 7.47,
154
+ "learning_rate": 0.0001,
155
+ "loss": 0.1179,
156
+ "step": 1400
157
+ },
158
+ {
159
+ "epoch": 8.0,
160
+ "learning_rate": 0.0001,
161
+ "loss": 0.1197,
162
+ "step": 1500
163
+ },
164
+ {
165
+ "epoch": 8.0,
166
+ "eval_accuracy": 0.770078431372549,
167
+ "eval_loss": 0.6074877977371216,
168
+ "eval_runtime": 6.9934,
169
+ "eval_samples_per_second": 71.496,
170
+ "eval_steps_per_second": 9.008,
171
+ "step": 1500
172
+ },
173
+ {
174
+ "epoch": 8.53,
175
+ "learning_rate": 0.0001,
176
+ "loss": 0.1,
177
+ "step": 1600
178
+ },
179
+ {
180
+ "epoch": 9.0,
181
+ "eval_accuracy": 0.7688470588235294,
182
+ "eval_loss": 0.631741464138031,
183
+ "eval_runtime": 7.3864,
184
+ "eval_samples_per_second": 67.692,
185
+ "eval_steps_per_second": 8.529,
186
+ "step": 1687
187
+ },
188
+ {
189
+ "epoch": 9.07,
190
+ "learning_rate": 0.0001,
191
+ "loss": 0.1027,
192
+ "step": 1700
193
+ },
194
+ {
195
+ "epoch": 9.6,
196
+ "learning_rate": 0.0001,
197
+ "loss": 0.089,
198
+ "step": 1800
199
+ },
200
+ {
201
+ "epoch": 10.0,
202
+ "eval_accuracy": 0.7663764705882353,
203
+ "eval_loss": 0.6717921495437622,
204
+ "eval_runtime": 6.6214,
205
+ "eval_samples_per_second": 75.513,
206
+ "eval_steps_per_second": 9.515,
207
+ "step": 1875
208
+ },
209
+ {
210
+ "epoch": 10.13,
211
+ "learning_rate": 0.0001,
212
+ "loss": 0.0911,
213
+ "step": 1900
214
+ },
215
+ {
216
+ "epoch": 10.67,
217
+ "learning_rate": 0.0001,
218
+ "loss": 0.0837,
219
+ "step": 2000
220
+ },
221
+ {
222
+ "epoch": 11.0,
223
+ "eval_accuracy": 0.7653333333333333,
224
+ "eval_loss": 0.6921772956848145,
225
+ "eval_runtime": 6.6402,
226
+ "eval_samples_per_second": 75.299,
227
+ "eval_steps_per_second": 9.488,
228
+ "step": 2062
229
+ },
230
+ {
231
+ "epoch": 11.2,
232
+ "learning_rate": 0.0001,
233
+ "loss": 0.0823,
234
+ "step": 2100
235
+ },
236
+ {
237
+ "epoch": 11.73,
238
+ "learning_rate": 0.0001,
239
+ "loss": 0.0788,
240
+ "step": 2200
241
+ },
242
+ {
243
+ "epoch": 12.0,
244
+ "eval_accuracy": 0.7631764705882353,
245
+ "eval_loss": 0.7253576517105103,
246
+ "eval_runtime": 7.6814,
247
+ "eval_samples_per_second": 65.092,
248
+ "eval_steps_per_second": 8.202,
249
+ "step": 2250
250
+ },
251
+ {
252
+ "epoch": 12.27,
253
+ "learning_rate": 0.0001,
254
+ "loss": 0.078,
255
+ "step": 2300
256
+ },
257
+ {
258
+ "epoch": 12.8,
259
+ "learning_rate": 0.0001,
260
+ "loss": 0.0761,
261
+ "step": 2400
262
+ },
263
+ {
264
+ "epoch": 13.0,
265
+ "eval_accuracy": 0.7628627450980392,
266
+ "eval_loss": 0.7256088256835938,
267
+ "eval_runtime": 7.6806,
268
+ "eval_samples_per_second": 65.099,
269
+ "eval_steps_per_second": 8.202,
270
+ "step": 2437
271
+ },
272
+ {
273
+ "epoch": 13.33,
274
+ "learning_rate": 0.0001,
275
+ "loss": 0.0743,
276
+ "step": 2500
277
+ },
278
+ {
279
+ "epoch": 13.87,
280
+ "learning_rate": 0.0001,
281
+ "loss": 0.0749,
282
+ "step": 2600
283
+ },
284
+ {
285
+ "epoch": 14.0,
286
+ "eval_accuracy": 0.7620627450980392,
287
+ "eval_loss": 0.7533740401268005,
288
+ "eval_runtime": 7.644,
289
+ "eval_samples_per_second": 65.41,
290
+ "eval_steps_per_second": 8.242,
291
+ "step": 2625
292
+ },
293
+ {
294
+ "epoch": 14.4,
295
+ "learning_rate": 0.0001,
296
+ "loss": 0.0719,
297
+ "step": 2700
298
+ },
299
+ {
300
+ "epoch": 14.93,
301
+ "learning_rate": 0.0001,
302
+ "loss": 0.0741,
303
+ "step": 2800
304
+ },
305
+ {
306
+ "epoch": 15.0,
307
+ "eval_accuracy": 0.7620078431372549,
308
+ "eval_loss": 0.7529163360595703,
309
+ "eval_runtime": 7.4208,
310
+ "eval_samples_per_second": 67.378,
311
+ "eval_steps_per_second": 8.49,
312
+ "step": 2812
313
+ },
314
+ {
315
+ "epoch": 15.47,
316
+ "learning_rate": 0.0001,
317
+ "loss": 0.0704,
318
+ "step": 2900
319
+ },
320
+ {
321
+ "epoch": 16.0,
322
+ "learning_rate": 0.0001,
323
+ "loss": 0.0726,
324
+ "step": 3000
325
+ },
326
+ {
327
+ "epoch": 16.0,
328
+ "eval_accuracy": 0.7610745098039216,
329
+ "eval_loss": 0.7678206562995911,
330
+ "eval_runtime": 6.6228,
331
+ "eval_samples_per_second": 75.497,
332
+ "eval_steps_per_second": 9.513,
333
+ "step": 3000
334
+ },
335
+ {
336
+ "epoch": 16.53,
337
+ "learning_rate": 0.0001,
338
+ "loss": 0.0687,
339
+ "step": 3100
340
+ },
341
+ {
342
+ "epoch": 17.0,
343
+ "eval_accuracy": 0.7610274509803922,
344
+ "eval_loss": 0.7728469371795654,
345
+ "eval_runtime": 7.7738,
346
+ "eval_samples_per_second": 64.319,
347
+ "eval_steps_per_second": 8.104,
348
+ "step": 3187
349
+ },
350
+ {
351
+ "epoch": 17.07,
352
+ "learning_rate": 0.0001,
353
+ "loss": 0.0706,
354
+ "step": 3200
355
+ },
356
+ {
357
+ "epoch": 17.6,
358
+ "learning_rate": 0.0001,
359
+ "loss": 0.0682,
360
+ "step": 3300
361
+ },
362
+ {
363
+ "epoch": 18.0,
364
+ "eval_accuracy": 0.7603058823529412,
365
+ "eval_loss": 0.7807328104972839,
366
+ "eval_runtime": 7.6425,
367
+ "eval_samples_per_second": 65.424,
368
+ "eval_steps_per_second": 8.243,
369
+ "step": 3375
370
+ },
371
+ {
372
+ "epoch": 18.13,
373
+ "learning_rate": 0.0001,
374
+ "loss": 0.069,
375
+ "step": 3400
376
+ },
377
+ {
378
+ "epoch": 18.67,
379
+ "learning_rate": 0.0001,
380
+ "loss": 0.0682,
381
+ "step": 3500
382
+ },
383
+ {
384
+ "epoch": 19.0,
385
+ "eval_accuracy": 0.7609882352941176,
386
+ "eval_loss": 0.7872016429901123,
387
+ "eval_runtime": 7.0778,
388
+ "eval_samples_per_second": 70.644,
389
+ "eval_steps_per_second": 8.901,
390
+ "step": 3562
391
+ },
392
+ {
393
+ "epoch": 19.2,
394
+ "learning_rate": 0.0001,
395
+ "loss": 0.0682,
396
+ "step": 3600
397
+ },
398
+ {
399
+ "epoch": 19.73,
400
+ "learning_rate": 0.0001,
401
+ "loss": 0.0682,
402
+ "step": 3700
403
+ },
404
+ {
405
+ "epoch": 19.95,
406
+ "eval_accuracy": 0.7597490196078431,
407
+ "eval_loss": 0.8054783940315247,
408
+ "eval_runtime": 7.0912,
409
+ "eval_samples_per_second": 70.51,
410
+ "eval_steps_per_second": 8.884,
411
+ "step": 3740
412
+ },
413
+ {
414
+ "epoch": 19.95,
415
+ "step": 3740,
416
+ "total_flos": 8.667972359988183e+17,
417
+ "train_loss": 0.1472176224152672,
418
+ "train_runtime": 2965.7973,
419
+ "train_samples_per_second": 40.461,
420
+ "train_steps_per_second": 1.261
421
+ }
422
+ ],
423
+ "logging_steps": 100,
424
+ "max_steps": 3740,
425
+ "num_train_epochs": 20,
426
+ "save_steps": 500,
427
+ "total_flos": 8.667972359988183e+17,
428
+ "trial_name": null,
429
+ "trial_params": null
430
+ }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:005e168b1c973939e97051f0f97993fb8f4912d19b4ec4476f4baca9a8332bd8
3
  size 4728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69229a474ae3be2188b1040a5433091508861cfca3d28c831620e7b24bd5c1d9
3
  size 4728