Alexziyu commited on
Commit
264c113
1 Parent(s): 8cd7739
README.md CHANGED
@@ -15,7 +15,12 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  This model is a fine-tuned version of [google/byt5-small](https://huggingface.co/google/byt5-small) on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 1.6233
 
 
 
 
 
19
 
20
  ## Model description
21
 
@@ -35,23 +40,12 @@ More information needed
35
 
36
  The following hyperparameters were used during training:
37
  - learning_rate: 5e-05
38
- - train_batch_size: 512
39
- - eval_batch_size: 512
40
  - seed: 42
41
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
42
  - lr_scheduler_type: linear
43
- - num_epochs: 5
44
-
45
- ### Training results
46
-
47
- | Training Loss | Epoch | Step | Validation Loss |
48
- |:-------------:|:-----:|:----:|:---------------:|
49
- | No log | 1.0 | 20 | 1.6871 |
50
- | No log | 2.0 | 40 | 1.6487 |
51
- | No log | 3.0 | 60 | 1.6341 |
52
- | No log | 4.0 | 80 | 1.6265 |
53
- | No log | 5.0 | 100 | 1.6233 |
54
-
55
 
56
  ### Framework versions
57
 
 
15
 
16
  This model is a fine-tuned version of [google/byt5-small](https://huggingface.co/google/byt5-small) on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
+ - eval_loss: 0.0003
19
+ - eval_runtime: 10.8156
20
+ - eval_samples_per_second: 924.594
21
+ - eval_steps_per_second: 1.202
22
+ - epoch: 51.0
23
+ - step: 1275
24
 
25
  ## Model description
26
 
 
40
 
41
  The following hyperparameters were used during training:
42
  - learning_rate: 5e-05
43
+ - train_batch_size: 800
44
+ - eval_batch_size: 800
45
  - seed: 42
46
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
  - lr_scheduler_type: linear
48
+ - num_epochs: 200
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  ### Framework versions
51
 
logs/{events.out.tfevents.1713583532.ip-10-25-205-144.8598.1 → events.out.tfevents.1713584966.ip-10-25-205-144.8598.2} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0606f1fd4f0eb3bb34ab3f703b623aeda9872640ed11a13ebc0b4ff408eb4c09
3
- size 6235
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1622dc30e39ec98a6db41321ae4a2ad655d6460bdcd56135aaacc64a58038e99
3
+ size 4184
logs/events.out.tfevents.1713585132.ip-10-25-205-144.16362.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c66862786a75e24d84e1f7ff6f90f1ed3417efe3ef777f95b76e762d33cbcdc1
3
+ size 4532
logs/events.out.tfevents.1713585198.ip-10-25-205-144.16362.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20786f07317093aa2b445bf6b95739504832b1916ce8a468c33a54ac9f6a2e6a
3
+ size 18642
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd6f85555e522847c8c8942e262aca1936610e6dd7600aa223cad99dbe45de90
3
  size 1198571496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fca102473377c7785f2f31f5a10961b290d7b0dc9b1d016d7f634b661502fea9
3
  size 1198571496
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e8b23f2643085930b3f7349662bcb04749807424fe293d7a34fd2c4acb9f8cc0
3
  size 2397245434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30957abcccff359d558399a21aae87ea1caffa62712a96bf816b14b90779b594
3
  size 2397245434
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7dbc7ace80ecdadeb29c581e017512420cee0602278ec068e2fe06794142a50a
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf53929ac8580a64da717c50cb8e706c6b73ed8e123b0f2ed70a2d025d0be0cf
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fcd60da393ff67ad2f08779377eeaa08189a6bf520f752442788825d9bce1449
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50d8e9838b612e3fef6825e76644fa188469427c8cabb80bf0d1aa1704101d1c
3
  size 1064
trainer_state.json CHANGED
@@ -1,59 +1,439 @@
1
  {
2
- "best_metric": 1.6233268976211548,
3
- "best_model_checkpoint": "AlexWang99/byt5_add/checkpoint-100",
4
- "epoch": 5.0,
5
  "eval_steps": 500,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
- "eval_loss": 1.6871496438980103,
14
- "eval_runtime": 12.4033,
15
- "eval_samples_per_second": 806.234,
16
- "eval_steps_per_second": 1.612,
17
- "step": 20
18
  },
19
  {
20
  "epoch": 2.0,
21
- "eval_loss": 1.6486749649047852,
22
- "eval_runtime": 12.4805,
23
- "eval_samples_per_second": 801.251,
24
- "eval_steps_per_second": 1.603,
25
- "step": 40
26
  },
27
  {
28
  "epoch": 3.0,
29
- "eval_loss": 1.6341347694396973,
30
- "eval_runtime": 12.3858,
31
- "eval_samples_per_second": 807.376,
32
- "eval_steps_per_second": 1.615,
33
- "step": 60
34
  },
35
  {
36
  "epoch": 4.0,
37
- "eval_loss": 1.6265267133712769,
38
- "eval_runtime": 12.382,
39
- "eval_samples_per_second": 807.624,
40
- "eval_steps_per_second": 1.615,
41
- "step": 80
42
  },
43
  {
44
  "epoch": 5.0,
45
- "eval_loss": 1.6233268976211548,
46
- "eval_runtime": 12.4067,
47
- "eval_samples_per_second": 806.016,
48
- "eval_steps_per_second": 1.612,
49
- "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  }
51
  ],
52
  "logging_steps": 500,
53
- "max_steps": 100,
54
- "num_train_epochs": 5,
55
  "save_steps": 500,
56
- "total_flos": 1435546214400000.0,
57
  "trial_name": null,
58
  "trial_params": null
59
  }
 
1
  {
2
+ "best_metric": 0.00030898803379386663,
3
+ "best_model_checkpoint": "AlexWang99/byt5_add/checkpoint-1275",
4
+ "epoch": 51.0,
5
  "eval_steps": 500,
6
+ "global_step": 1275,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
+ "eval_loss": 1.747314691543579,
14
+ "eval_runtime": 11.1213,
15
+ "eval_samples_per_second": 899.172,
16
+ "eval_steps_per_second": 1.169,
17
+ "step": 25
18
  },
19
  {
20
  "epoch": 2.0,
21
+ "eval_loss": 1.6477937698364258,
22
+ "eval_runtime": 10.7967,
23
+ "eval_samples_per_second": 926.209,
24
+ "eval_steps_per_second": 1.204,
25
+ "step": 50
26
  },
27
  {
28
  "epoch": 3.0,
29
+ "eval_loss": 1.5999796390533447,
30
+ "eval_runtime": 10.8098,
31
+ "eval_samples_per_second": 925.089,
32
+ "eval_steps_per_second": 1.203,
33
+ "step": 75
34
  },
35
  {
36
  "epoch": 4.0,
37
+ "eval_loss": 1.4885144233703613,
38
+ "eval_runtime": 10.8417,
39
+ "eval_samples_per_second": 922.367,
40
+ "eval_steps_per_second": 1.199,
41
+ "step": 100
42
  },
43
  {
44
  "epoch": 5.0,
45
+ "eval_loss": 1.3953396081924438,
46
+ "eval_runtime": 10.8419,
47
+ "eval_samples_per_second": 922.345,
48
+ "eval_steps_per_second": 1.199,
49
+ "step": 125
50
+ },
51
+ {
52
+ "epoch": 6.0,
53
+ "eval_loss": 1.2306207418441772,
54
+ "eval_runtime": 10.8327,
55
+ "eval_samples_per_second": 923.131,
56
+ "eval_steps_per_second": 1.2,
57
+ "step": 150
58
+ },
59
+ {
60
+ "epoch": 7.0,
61
+ "eval_loss": 1.0172127485275269,
62
+ "eval_runtime": 10.8404,
63
+ "eval_samples_per_second": 922.478,
64
+ "eval_steps_per_second": 1.199,
65
+ "step": 175
66
+ },
67
+ {
68
+ "epoch": 8.0,
69
+ "eval_loss": 0.7508996725082397,
70
+ "eval_runtime": 10.867,
71
+ "eval_samples_per_second": 920.221,
72
+ "eval_steps_per_second": 1.196,
73
+ "step": 200
74
+ },
75
+ {
76
+ "epoch": 9.0,
77
+ "eval_loss": 0.5204245448112488,
78
+ "eval_runtime": 10.837,
79
+ "eval_samples_per_second": 922.761,
80
+ "eval_steps_per_second": 1.2,
81
+ "step": 225
82
+ },
83
+ {
84
+ "epoch": 10.0,
85
+ "eval_loss": 0.3563512861728668,
86
+ "eval_runtime": 11.004,
87
+ "eval_samples_per_second": 908.763,
88
+ "eval_steps_per_second": 1.181,
89
+ "step": 250
90
+ },
91
+ {
92
+ "epoch": 11.0,
93
+ "eval_loss": 0.3062296211719513,
94
+ "eval_runtime": 10.8369,
95
+ "eval_samples_per_second": 922.772,
96
+ "eval_steps_per_second": 1.2,
97
+ "step": 275
98
+ },
99
+ {
100
+ "epoch": 12.0,
101
+ "eval_loss": 0.23057429492473602,
102
+ "eval_runtime": 10.799,
103
+ "eval_samples_per_second": 926.015,
104
+ "eval_steps_per_second": 1.204,
105
+ "step": 300
106
+ },
107
+ {
108
+ "epoch": 13.0,
109
+ "eval_loss": 0.17026692628860474,
110
+ "eval_runtime": 10.8267,
111
+ "eval_samples_per_second": 923.643,
112
+ "eval_steps_per_second": 1.201,
113
+ "step": 325
114
+ },
115
+ {
116
+ "epoch": 14.0,
117
+ "eval_loss": 0.14094401895999908,
118
+ "eval_runtime": 10.8171,
119
+ "eval_samples_per_second": 924.461,
120
+ "eval_steps_per_second": 1.202,
121
+ "step": 350
122
+ },
123
+ {
124
+ "epoch": 15.0,
125
+ "eval_loss": 0.11562483012676239,
126
+ "eval_runtime": 10.8544,
127
+ "eval_samples_per_second": 921.286,
128
+ "eval_steps_per_second": 1.198,
129
+ "step": 375
130
+ },
131
+ {
132
+ "epoch": 16.0,
133
+ "eval_loss": 0.1076672226190567,
134
+ "eval_runtime": 10.7997,
135
+ "eval_samples_per_second": 925.947,
136
+ "eval_steps_per_second": 1.204,
137
+ "step": 400
138
+ },
139
+ {
140
+ "epoch": 17.0,
141
+ "eval_loss": 0.07891710102558136,
142
+ "eval_runtime": 10.8355,
143
+ "eval_samples_per_second": 922.895,
144
+ "eval_steps_per_second": 1.2,
145
+ "step": 425
146
+ },
147
+ {
148
+ "epoch": 18.0,
149
+ "eval_loss": 0.07825633883476257,
150
+ "eval_runtime": 10.9577,
151
+ "eval_samples_per_second": 912.598,
152
+ "eval_steps_per_second": 1.186,
153
+ "step": 450
154
+ },
155
+ {
156
+ "epoch": 19.0,
157
+ "eval_loss": 0.053240709006786346,
158
+ "eval_runtime": 10.8407,
159
+ "eval_samples_per_second": 922.45,
160
+ "eval_steps_per_second": 1.199,
161
+ "step": 475
162
+ },
163
+ {
164
+ "epoch": 20.0,
165
+ "learning_rate": 4.5e-05,
166
+ "loss": 0.8626,
167
+ "step": 500
168
+ },
169
+ {
170
+ "epoch": 20.0,
171
+ "eval_loss": 0.03896724432706833,
172
+ "eval_runtime": 10.8919,
173
+ "eval_samples_per_second": 918.117,
174
+ "eval_steps_per_second": 1.194,
175
+ "step": 500
176
+ },
177
+ {
178
+ "epoch": 21.0,
179
+ "eval_loss": 0.0326126404106617,
180
+ "eval_runtime": 10.9808,
181
+ "eval_samples_per_second": 910.682,
182
+ "eval_steps_per_second": 1.184,
183
+ "step": 525
184
+ },
185
+ {
186
+ "epoch": 22.0,
187
+ "eval_loss": 0.026844095438718796,
188
+ "eval_runtime": 10.8647,
189
+ "eval_samples_per_second": 920.415,
190
+ "eval_steps_per_second": 1.197,
191
+ "step": 550
192
+ },
193
+ {
194
+ "epoch": 23.0,
195
+ "eval_loss": 0.022708676755428314,
196
+ "eval_runtime": 11.1211,
197
+ "eval_samples_per_second": 899.191,
198
+ "eval_steps_per_second": 1.169,
199
+ "step": 575
200
+ },
201
+ {
202
+ "epoch": 24.0,
203
+ "eval_loss": 0.020555635914206505,
204
+ "eval_runtime": 10.8169,
205
+ "eval_samples_per_second": 924.48,
206
+ "eval_steps_per_second": 1.202,
207
+ "step": 600
208
+ },
209
+ {
210
+ "epoch": 25.0,
211
+ "eval_loss": 0.016072452068328857,
212
+ "eval_runtime": 10.8195,
213
+ "eval_samples_per_second": 924.261,
214
+ "eval_steps_per_second": 1.202,
215
+ "step": 625
216
+ },
217
+ {
218
+ "epoch": 26.0,
219
+ "eval_loss": 0.015775442123413086,
220
+ "eval_runtime": 11.0521,
221
+ "eval_samples_per_second": 904.809,
222
+ "eval_steps_per_second": 1.176,
223
+ "step": 650
224
+ },
225
+ {
226
+ "epoch": 27.0,
227
+ "eval_loss": 0.010050756856799126,
228
+ "eval_runtime": 10.96,
229
+ "eval_samples_per_second": 912.407,
230
+ "eval_steps_per_second": 1.186,
231
+ "step": 675
232
+ },
233
+ {
234
+ "epoch": 28.0,
235
+ "eval_loss": 0.009800990112125874,
236
+ "eval_runtime": 10.8085,
237
+ "eval_samples_per_second": 925.196,
238
+ "eval_steps_per_second": 1.203,
239
+ "step": 700
240
+ },
241
+ {
242
+ "epoch": 29.0,
243
+ "eval_loss": 0.0077048842795193195,
244
+ "eval_runtime": 10.9528,
245
+ "eval_samples_per_second": 913.005,
246
+ "eval_steps_per_second": 1.187,
247
+ "step": 725
248
+ },
249
+ {
250
+ "epoch": 30.0,
251
+ "eval_loss": 0.005685885436832905,
252
+ "eval_runtime": 10.9631,
253
+ "eval_samples_per_second": 912.147,
254
+ "eval_steps_per_second": 1.186,
255
+ "step": 750
256
+ },
257
+ {
258
+ "epoch": 31.0,
259
+ "eval_loss": 0.006655455566942692,
260
+ "eval_runtime": 10.8367,
261
+ "eval_samples_per_second": 922.788,
262
+ "eval_steps_per_second": 1.2,
263
+ "step": 775
264
+ },
265
+ {
266
+ "epoch": 32.0,
267
+ "eval_loss": 0.004621443338692188,
268
+ "eval_runtime": 10.8165,
269
+ "eval_samples_per_second": 924.51,
270
+ "eval_steps_per_second": 1.202,
271
+ "step": 800
272
+ },
273
+ {
274
+ "epoch": 33.0,
275
+ "eval_loss": 0.0033882376737892628,
276
+ "eval_runtime": 10.9293,
277
+ "eval_samples_per_second": 914.976,
278
+ "eval_steps_per_second": 1.189,
279
+ "step": 825
280
+ },
281
+ {
282
+ "epoch": 34.0,
283
+ "eval_loss": 0.0038037376943975687,
284
+ "eval_runtime": 10.7973,
285
+ "eval_samples_per_second": 926.155,
286
+ "eval_steps_per_second": 1.204,
287
+ "step": 850
288
+ },
289
+ {
290
+ "epoch": 35.0,
291
+ "eval_loss": 0.003371346276253462,
292
+ "eval_runtime": 10.834,
293
+ "eval_samples_per_second": 923.021,
294
+ "eval_steps_per_second": 1.2,
295
+ "step": 875
296
+ },
297
+ {
298
+ "epoch": 36.0,
299
+ "eval_loss": 0.0024659824557602406,
300
+ "eval_runtime": 10.7902,
301
+ "eval_samples_per_second": 926.766,
302
+ "eval_steps_per_second": 1.205,
303
+ "step": 900
304
+ },
305
+ {
306
+ "epoch": 37.0,
307
+ "eval_loss": 0.0022366114426404238,
308
+ "eval_runtime": 10.8096,
309
+ "eval_samples_per_second": 925.1,
310
+ "eval_steps_per_second": 1.203,
311
+ "step": 925
312
+ },
313
+ {
314
+ "epoch": 38.0,
315
+ "eval_loss": 0.0022026619408279657,
316
+ "eval_runtime": 10.8109,
317
+ "eval_samples_per_second": 924.992,
318
+ "eval_steps_per_second": 1.202,
319
+ "step": 950
320
+ },
321
+ {
322
+ "epoch": 39.0,
323
+ "eval_loss": 0.0024010157212615013,
324
+ "eval_runtime": 11.1034,
325
+ "eval_samples_per_second": 900.623,
326
+ "eval_steps_per_second": 1.171,
327
+ "step": 975
328
+ },
329
+ {
330
+ "epoch": 40.0,
331
+ "learning_rate": 4e-05,
332
+ "loss": 0.0919,
333
+ "step": 1000
334
+ },
335
+ {
336
+ "epoch": 40.0,
337
+ "eval_loss": 0.0013342766324058175,
338
+ "eval_runtime": 10.7511,
339
+ "eval_samples_per_second": 930.139,
340
+ "eval_steps_per_second": 1.209,
341
+ "step": 1000
342
+ },
343
+ {
344
+ "epoch": 41.0,
345
+ "eval_loss": 0.0016493805451318622,
346
+ "eval_runtime": 10.7987,
347
+ "eval_samples_per_second": 926.034,
348
+ "eval_steps_per_second": 1.204,
349
+ "step": 1025
350
+ },
351
+ {
352
+ "epoch": 42.0,
353
+ "eval_loss": 0.001088765449821949,
354
+ "eval_runtime": 10.8106,
355
+ "eval_samples_per_second": 925.017,
356
+ "eval_steps_per_second": 1.203,
357
+ "step": 1050
358
+ },
359
+ {
360
+ "epoch": 43.0,
361
+ "eval_loss": 0.0009081660537049174,
362
+ "eval_runtime": 10.7945,
363
+ "eval_samples_per_second": 926.398,
364
+ "eval_steps_per_second": 1.204,
365
+ "step": 1075
366
+ },
367
+ {
368
+ "epoch": 44.0,
369
+ "eval_loss": 0.0007170450408011675,
370
+ "eval_runtime": 10.9388,
371
+ "eval_samples_per_second": 914.174,
372
+ "eval_steps_per_second": 1.188,
373
+ "step": 1100
374
+ },
375
+ {
376
+ "epoch": 45.0,
377
+ "eval_loss": 0.0006850157515145838,
378
+ "eval_runtime": 10.8231,
379
+ "eval_samples_per_second": 923.948,
380
+ "eval_steps_per_second": 1.201,
381
+ "step": 1125
382
+ },
383
+ {
384
+ "epoch": 46.0,
385
+ "eval_loss": 0.0007588361040689051,
386
+ "eval_runtime": 10.9442,
387
+ "eval_samples_per_second": 913.729,
388
+ "eval_steps_per_second": 1.188,
389
+ "step": 1150
390
+ },
391
+ {
392
+ "epoch": 47.0,
393
+ "eval_loss": 0.0007894792361184955,
394
+ "eval_runtime": 10.9394,
395
+ "eval_samples_per_second": 914.125,
396
+ "eval_steps_per_second": 1.188,
397
+ "step": 1175
398
+ },
399
+ {
400
+ "epoch": 48.0,
401
+ "eval_loss": 0.0004850537225138396,
402
+ "eval_runtime": 10.8141,
403
+ "eval_samples_per_second": 924.722,
404
+ "eval_steps_per_second": 1.202,
405
+ "step": 1200
406
+ },
407
+ {
408
+ "epoch": 49.0,
409
+ "eval_loss": 0.0003986251540482044,
410
+ "eval_runtime": 10.7964,
411
+ "eval_samples_per_second": 926.231,
412
+ "eval_steps_per_second": 1.204,
413
+ "step": 1225
414
+ },
415
+ {
416
+ "epoch": 50.0,
417
+ "eval_loss": 0.0005350292194634676,
418
+ "eval_runtime": 10.9488,
419
+ "eval_samples_per_second": 913.343,
420
+ "eval_steps_per_second": 1.187,
421
+ "step": 1250
422
+ },
423
+ {
424
+ "epoch": 51.0,
425
+ "eval_loss": 0.00030898803379386663,
426
+ "eval_runtime": 10.8156,
427
+ "eval_samples_per_second": 924.594,
428
+ "eval_steps_per_second": 1.202,
429
+ "step": 1275
430
  }
431
  ],
432
  "logging_steps": 500,
433
+ "max_steps": 5000,
434
+ "num_train_epochs": 200,
435
  "save_steps": 500,
436
+ "total_flos": 2.928514277376e+16,
437
  "trial_name": null,
438
  "trial_params": null
439
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e8b5c9766275bc0bd47f9cd5e0eeb351509bdb9ad29146ec950b20b28980e67
3
  size 4792
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:714811acdbc411e9172b3493c4b9194e7296d94c31ad1d94203d6c55f1d9c32b
3
  size 4792