shnl committed on
Commit
1ffcf85
1 Parent(s): c012f41

'instruction'

Browse files
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<mask>": 64000
3
+ }
bpe.codes ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "vinai/bartpho-word-base",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "architectures": [
6
+ "MBartForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "bos_token_id": 0,
10
+ "classifier_dropout": 0.0,
11
+ "d_model": 768,
12
+ "decoder_attention_heads": 12,
13
+ "decoder_ffn_dim": 3072,
14
+ "decoder_layerdrop": 0.0,
15
+ "decoder_layers": 6,
16
+ "decoder_start_token_id": 2,
17
+ "dropout": 0.1,
18
+ "encoder_attention_heads": 12,
19
+ "encoder_ffn_dim": 3072,
20
+ "encoder_layerdrop": 0.0,
21
+ "encoder_layers": 6,
22
+ "eos_token_id": 2,
23
+ "forced_eos_token_id": 2,
24
+ "gradient_checkpointing": false,
25
+ "init_std": 0.02,
26
+ "is_encoder_decoder": true,
27
+ "max_position_embeddings": 1024,
28
+ "model_type": "mbart",
29
+ "num_hidden_layers": 6,
30
+ "pad_token_id": 1,
31
+ "scale_embedding": false,
32
+ "tokenizer_class": "PhobertTokenizer",
33
+ "torch_dtype": "float32",
34
+ "transformers_version": "4.30.2",
35
+ "use_cache": true,
36
+ "vocab_size": 64001
37
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "eos_token_id": 2,
6
+ "forced_eos_token_id": 2,
7
+ "pad_token_id": 1,
8
+ "transformers_version": "4.30.2"
9
+ }
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24673b9c8e0e617f8a638a76a3f64552d24d0a2a3a93bd6941932a56652346f3
3
+ size 1199936197
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8b7481c14ace3faf9e8941683afa2bbbca47d5563202ac634562867b7a9dc67
3
+ size 600236781
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a2f80d31b05869407919f8c3e4048d72f1b48d3fc011014515bda593159c127
3
+ size 14575
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf928355da597edad3b10c86cba6df7361b9e35e5e45cc85d1ee1a4d4fa92559
3
+ size 627
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": "<mask>",
6
+ "pad_token": "<pad>",
7
+ "sep_token": "</s>",
8
+ "unk_token": "<unk>"
9
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "clean_up_tokenization_spaces": true,
4
+ "cls_token": "<s>",
5
+ "eos_token": "</s>",
6
+ "mask_token": "<mask>",
7
+ "model_max_length": 1000000000000000019884624838656,
8
+ "pad_token": "<pad>",
9
+ "sep_token": "</s>",
10
+ "tokenizer_class": "PhobertTokenizer",
11
+ "unk_token": "<unk>"
12
+ }
trainer_state.json ADDED
@@ -0,0 +1,618 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 9.974948758824869,
5
+ "global_step": 10950,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.23,
12
+ "learning_rate": 4.553734061930783e-05,
13
+ "loss": 1.3816,
14
+ "step": 250
15
+ },
16
+ {
17
+ "epoch": 0.23,
18
+ "eval_loss": 0.1994573473930359,
19
+ "eval_runtime": 49.3939,
20
+ "eval_samples_per_second": 50.553,
21
+ "eval_steps_per_second": 6.337,
22
+ "step": 250
23
+ },
24
+ {
25
+ "epoch": 0.46,
26
+ "learning_rate": 9.107468123861566e-05,
27
+ "loss": 0.1818,
28
+ "step": 500
29
+ },
30
+ {
31
+ "epoch": 0.46,
32
+ "eval_loss": 0.18140748143196106,
33
+ "eval_runtime": 49.3506,
34
+ "eval_samples_per_second": 50.597,
35
+ "eval_steps_per_second": 6.342,
36
+ "step": 500
37
+ },
38
+ {
39
+ "epoch": 0.68,
40
+ "learning_rate": 9.807120237981e-05,
41
+ "loss": 0.1716,
42
+ "step": 750
43
+ },
44
+ {
45
+ "epoch": 0.68,
46
+ "eval_loss": 0.1712454855442047,
47
+ "eval_runtime": 49.3274,
48
+ "eval_samples_per_second": 50.621,
49
+ "eval_steps_per_second": 6.345,
50
+ "step": 750
51
+ },
52
+ {
53
+ "epoch": 0.91,
54
+ "learning_rate": 9.567220036464831e-05,
55
+ "loss": 0.1501,
56
+ "step": 1000
57
+ },
58
+ {
59
+ "epoch": 0.91,
60
+ "eval_loss": 0.1643093228340149,
61
+ "eval_runtime": 49.3431,
62
+ "eval_samples_per_second": 50.605,
63
+ "eval_steps_per_second": 6.343,
64
+ "step": 1000
65
+ },
66
+ {
67
+ "epoch": 1.14,
68
+ "learning_rate": 9.327319834948663e-05,
69
+ "loss": 0.122,
70
+ "step": 1250
71
+ },
72
+ {
73
+ "epoch": 1.14,
74
+ "eval_loss": 0.16473376750946045,
75
+ "eval_runtime": 49.4257,
76
+ "eval_samples_per_second": 50.52,
77
+ "eval_steps_per_second": 6.333,
78
+ "step": 1250
79
+ },
80
+ {
81
+ "epoch": 1.37,
82
+ "learning_rate": 9.087419633432492e-05,
83
+ "loss": 0.1143,
84
+ "step": 1500
85
+ },
86
+ {
87
+ "epoch": 1.37,
88
+ "eval_loss": 0.15297986567020416,
89
+ "eval_runtime": 49.4063,
90
+ "eval_samples_per_second": 50.54,
91
+ "eval_steps_per_second": 6.335,
92
+ "step": 1500
93
+ },
94
+ {
95
+ "epoch": 1.59,
96
+ "learning_rate": 8.847519431916324e-05,
97
+ "loss": 0.1121,
98
+ "step": 1750
99
+ },
100
+ {
101
+ "epoch": 1.59,
102
+ "eval_loss": 0.16007670760154724,
103
+ "eval_runtime": 49.3369,
104
+ "eval_samples_per_second": 50.611,
105
+ "eval_steps_per_second": 6.344,
106
+ "step": 1750
107
+ },
108
+ {
109
+ "epoch": 1.82,
110
+ "learning_rate": 8.607619230400153e-05,
111
+ "loss": 0.1078,
112
+ "step": 2000
113
+ },
114
+ {
115
+ "epoch": 1.82,
116
+ "eval_loss": 0.14469072222709656,
117
+ "eval_runtime": 49.3439,
118
+ "eval_samples_per_second": 50.604,
119
+ "eval_steps_per_second": 6.343,
120
+ "step": 2000
121
+ },
122
+ {
123
+ "epoch": 2.05,
124
+ "learning_rate": 8.367719028883985e-05,
125
+ "loss": 0.0954,
126
+ "step": 2250
127
+ },
128
+ {
129
+ "epoch": 2.05,
130
+ "eval_loss": 0.14710724353790283,
131
+ "eval_runtime": 49.3536,
132
+ "eval_samples_per_second": 50.594,
133
+ "eval_steps_per_second": 6.342,
134
+ "step": 2250
135
+ },
136
+ {
137
+ "epoch": 2.28,
138
+ "learning_rate": 8.127818827367816e-05,
139
+ "loss": 0.0804,
140
+ "step": 2500
141
+ },
142
+ {
143
+ "epoch": 2.28,
144
+ "eval_loss": 0.13685058057308197,
145
+ "eval_runtime": 49.355,
146
+ "eval_samples_per_second": 50.593,
147
+ "eval_steps_per_second": 6.342,
148
+ "step": 2500
149
+ },
150
+ {
151
+ "epoch": 2.51,
152
+ "learning_rate": 7.887918625851645e-05,
153
+ "loss": 0.08,
154
+ "step": 2750
155
+ },
156
+ {
157
+ "epoch": 2.51,
158
+ "eval_loss": 0.13740722835063934,
159
+ "eval_runtime": 49.3607,
160
+ "eval_samples_per_second": 50.587,
161
+ "eval_steps_per_second": 6.341,
162
+ "step": 2750
163
+ },
164
+ {
165
+ "epoch": 2.73,
166
+ "learning_rate": 7.648018424335477e-05,
167
+ "loss": 0.0769,
168
+ "step": 3000
169
+ },
170
+ {
171
+ "epoch": 2.73,
172
+ "eval_loss": 0.1370161473751068,
173
+ "eval_runtime": 49.3701,
174
+ "eval_samples_per_second": 50.577,
175
+ "eval_steps_per_second": 6.34,
176
+ "step": 3000
177
+ },
178
+ {
179
+ "epoch": 2.96,
180
+ "learning_rate": 7.408118222819308e-05,
181
+ "loss": 0.0782,
182
+ "step": 3250
183
+ },
184
+ {
185
+ "epoch": 2.96,
186
+ "eval_loss": 0.13374929130077362,
187
+ "eval_runtime": 49.4074,
188
+ "eval_samples_per_second": 50.539,
189
+ "eval_steps_per_second": 6.335,
190
+ "step": 3250
191
+ },
192
+ {
193
+ "epoch": 3.19,
194
+ "learning_rate": 7.168218021303138e-05,
195
+ "loss": 0.0591,
196
+ "step": 3500
197
+ },
198
+ {
199
+ "epoch": 3.19,
200
+ "eval_loss": 0.13687731325626373,
201
+ "eval_runtime": 49.344,
202
+ "eval_samples_per_second": 50.604,
203
+ "eval_steps_per_second": 6.343,
204
+ "step": 3500
205
+ },
206
+ {
207
+ "epoch": 3.42,
208
+ "learning_rate": 6.928317819786969e-05,
209
+ "loss": 0.0575,
210
+ "step": 3750
211
+ },
212
+ {
213
+ "epoch": 3.42,
214
+ "eval_loss": 0.13442662358283997,
215
+ "eval_runtime": 49.347,
216
+ "eval_samples_per_second": 50.601,
217
+ "eval_steps_per_second": 6.343,
218
+ "step": 3750
219
+ },
220
+ {
221
+ "epoch": 3.64,
222
+ "learning_rate": 6.6884176182708e-05,
223
+ "loss": 0.0579,
224
+ "step": 4000
225
+ },
226
+ {
227
+ "epoch": 3.64,
228
+ "eval_loss": 0.13463211059570312,
229
+ "eval_runtime": 49.3532,
230
+ "eval_samples_per_second": 50.595,
231
+ "eval_steps_per_second": 6.342,
232
+ "step": 4000
233
+ },
234
+ {
235
+ "epoch": 3.87,
236
+ "learning_rate": 6.44851741675463e-05,
237
+ "loss": 0.0541,
238
+ "step": 4250
239
+ },
240
+ {
241
+ "epoch": 3.87,
242
+ "eval_loss": 0.12762367725372314,
243
+ "eval_runtime": 49.3131,
244
+ "eval_samples_per_second": 50.636,
245
+ "eval_steps_per_second": 6.347,
246
+ "step": 4250
247
+ },
248
+ {
249
+ "epoch": 4.1,
250
+ "learning_rate": 6.208617215238462e-05,
251
+ "loss": 0.0469,
252
+ "step": 4500
253
+ },
254
+ {
255
+ "epoch": 4.1,
256
+ "eval_loss": 0.1321054846048355,
257
+ "eval_runtime": 49.3382,
258
+ "eval_samples_per_second": 50.61,
259
+ "eval_steps_per_second": 6.344,
260
+ "step": 4500
261
+ },
262
+ {
263
+ "epoch": 4.33,
264
+ "learning_rate": 5.968717013722291e-05,
265
+ "loss": 0.0367,
266
+ "step": 4750
267
+ },
268
+ {
269
+ "epoch": 4.33,
270
+ "eval_loss": 0.13329076766967773,
271
+ "eval_runtime": 49.3411,
272
+ "eval_samples_per_second": 50.607,
273
+ "eval_steps_per_second": 6.344,
274
+ "step": 4750
275
+ },
276
+ {
277
+ "epoch": 4.55,
278
+ "learning_rate": 5.7288168122061226e-05,
279
+ "loss": 0.0409,
280
+ "step": 5000
281
+ },
282
+ {
283
+ "epoch": 4.55,
284
+ "eval_loss": 0.13458645343780518,
285
+ "eval_runtime": 49.3393,
286
+ "eval_samples_per_second": 50.609,
287
+ "eval_steps_per_second": 6.344,
288
+ "step": 5000
289
+ },
290
+ {
291
+ "epoch": 4.78,
292
+ "learning_rate": 5.488916610689954e-05,
293
+ "loss": 0.0402,
294
+ "step": 5250
295
+ },
296
+ {
297
+ "epoch": 4.78,
298
+ "eval_loss": 0.12923233211040497,
299
+ "eval_runtime": 49.4868,
300
+ "eval_samples_per_second": 50.458,
301
+ "eval_steps_per_second": 6.325,
302
+ "step": 5250
303
+ },
304
+ {
305
+ "epoch": 5.01,
306
+ "learning_rate": 5.249016409173784e-05,
307
+ "loss": 0.0378,
308
+ "step": 5500
309
+ },
310
+ {
311
+ "epoch": 5.01,
312
+ "eval_loss": 0.12460647523403168,
313
+ "eval_runtime": 49.3719,
314
+ "eval_samples_per_second": 50.575,
315
+ "eval_steps_per_second": 6.34,
316
+ "step": 5500
317
+ },
318
+ {
319
+ "epoch": 5.24,
320
+ "learning_rate": 5.009116207657615e-05,
321
+ "loss": 0.0258,
322
+ "step": 5750
323
+ },
324
+ {
325
+ "epoch": 5.24,
326
+ "eval_loss": 0.1305789053440094,
327
+ "eval_runtime": 49.3929,
328
+ "eval_samples_per_second": 50.554,
329
+ "eval_steps_per_second": 6.337,
330
+ "step": 5750
331
+ },
332
+ {
333
+ "epoch": 5.47,
334
+ "learning_rate": 4.769216006141446e-05,
335
+ "loss": 0.0252,
336
+ "step": 6000
337
+ },
338
+ {
339
+ "epoch": 5.47,
340
+ "eval_loss": 0.13075487315654755,
341
+ "eval_runtime": 49.3793,
342
+ "eval_samples_per_second": 50.568,
343
+ "eval_steps_per_second": 6.339,
344
+ "step": 6000
345
+ },
346
+ {
347
+ "epoch": 5.69,
348
+ "learning_rate": 4.5293158046252756e-05,
349
+ "loss": 0.0266,
350
+ "step": 6250
351
+ },
352
+ {
353
+ "epoch": 5.69,
354
+ "eval_loss": 0.13152019679546356,
355
+ "eval_runtime": 49.3918,
356
+ "eval_samples_per_second": 50.555,
357
+ "eval_steps_per_second": 6.337,
358
+ "step": 6250
359
+ },
360
+ {
361
+ "epoch": 5.92,
362
+ "learning_rate": 4.289415603109107e-05,
363
+ "loss": 0.0264,
364
+ "step": 6500
365
+ },
366
+ {
367
+ "epoch": 5.92,
368
+ "eval_loss": 0.12978705763816833,
369
+ "eval_runtime": 49.4158,
370
+ "eval_samples_per_second": 50.53,
371
+ "eval_steps_per_second": 6.334,
372
+ "step": 6500
373
+ },
374
+ {
375
+ "epoch": 6.15,
376
+ "learning_rate": 4.0495154015929375e-05,
377
+ "loss": 0.0204,
378
+ "step": 6750
379
+ },
380
+ {
381
+ "epoch": 6.15,
382
+ "eval_loss": 0.1330789029598236,
383
+ "eval_runtime": 49.4096,
384
+ "eval_samples_per_second": 50.537,
385
+ "eval_steps_per_second": 6.335,
386
+ "step": 6750
387
+ },
388
+ {
389
+ "epoch": 6.38,
390
+ "learning_rate": 3.809615200076768e-05,
391
+ "loss": 0.0176,
392
+ "step": 7000
393
+ },
394
+ {
395
+ "epoch": 6.38,
396
+ "eval_loss": 0.13327623903751373,
397
+ "eval_runtime": 49.4281,
398
+ "eval_samples_per_second": 50.518,
399
+ "eval_steps_per_second": 6.332,
400
+ "step": 7000
401
+ },
402
+ {
403
+ "epoch": 6.6,
404
+ "learning_rate": 3.569714998560599e-05,
405
+ "loss": 0.0177,
406
+ "step": 7250
407
+ },
408
+ {
409
+ "epoch": 6.6,
410
+ "eval_loss": 0.13123974204063416,
411
+ "eval_runtime": 49.4644,
412
+ "eval_samples_per_second": 50.481,
413
+ "eval_steps_per_second": 6.328,
414
+ "step": 7250
415
+ },
416
+ {
417
+ "epoch": 6.83,
418
+ "learning_rate": 3.32981479704443e-05,
419
+ "loss": 0.0161,
420
+ "step": 7500
421
+ },
422
+ {
423
+ "epoch": 6.83,
424
+ "eval_loss": 0.1328614354133606,
425
+ "eval_runtime": 49.4499,
426
+ "eval_samples_per_second": 50.496,
427
+ "eval_steps_per_second": 6.33,
428
+ "step": 7500
429
+ },
430
+ {
431
+ "epoch": 7.06,
432
+ "learning_rate": 3.0899145955282606e-05,
433
+ "loss": 0.016,
434
+ "step": 7750
435
+ },
436
+ {
437
+ "epoch": 7.06,
438
+ "eval_loss": 0.13026753067970276,
439
+ "eval_runtime": 49.4661,
440
+ "eval_samples_per_second": 50.479,
441
+ "eval_steps_per_second": 6.328,
442
+ "step": 7750
443
+ },
444
+ {
445
+ "epoch": 7.29,
446
+ "learning_rate": 2.850014394012091e-05,
447
+ "loss": 0.0104,
448
+ "step": 8000
449
+ },
450
+ {
451
+ "epoch": 7.29,
452
+ "eval_loss": 0.13250969350337982,
453
+ "eval_runtime": 49.5109,
454
+ "eval_samples_per_second": 50.433,
455
+ "eval_steps_per_second": 6.322,
456
+ "step": 8000
457
+ },
458
+ {
459
+ "epoch": 7.52,
460
+ "learning_rate": 2.6101141924959215e-05,
461
+ "loss": 0.0104,
462
+ "step": 8250
463
+ },
464
+ {
465
+ "epoch": 7.52,
466
+ "eval_loss": 0.1344473958015442,
467
+ "eval_runtime": 49.4545,
468
+ "eval_samples_per_second": 50.491,
469
+ "eval_steps_per_second": 6.329,
470
+ "step": 8250
471
+ },
472
+ {
473
+ "epoch": 7.74,
474
+ "learning_rate": 2.3702139909797524e-05,
475
+ "loss": 0.0107,
476
+ "step": 8500
477
+ },
478
+ {
479
+ "epoch": 7.74,
480
+ "eval_loss": 0.13361412286758423,
481
+ "eval_runtime": 49.4611,
482
+ "eval_samples_per_second": 50.484,
483
+ "eval_steps_per_second": 6.328,
484
+ "step": 8500
485
+ },
486
+ {
487
+ "epoch": 7.97,
488
+ "learning_rate": 2.1303137894635834e-05,
489
+ "loss": 0.0105,
490
+ "step": 8750
491
+ },
492
+ {
493
+ "epoch": 7.97,
494
+ "eval_loss": 0.1311049610376358,
495
+ "eval_runtime": 49.3899,
496
+ "eval_samples_per_second": 50.557,
497
+ "eval_steps_per_second": 6.337,
498
+ "step": 8750
499
+ },
500
+ {
501
+ "epoch": 8.2,
502
+ "learning_rate": 1.890413587947414e-05,
503
+ "loss": 0.0072,
504
+ "step": 9000
505
+ },
506
+ {
507
+ "epoch": 8.2,
508
+ "eval_loss": 0.1345677375793457,
509
+ "eval_runtime": 49.4945,
510
+ "eval_samples_per_second": 50.45,
511
+ "eval_steps_per_second": 6.324,
512
+ "step": 9000
513
+ },
514
+ {
515
+ "epoch": 8.43,
516
+ "learning_rate": 1.6505133864312446e-05,
517
+ "loss": 0.0065,
518
+ "step": 9250
519
+ },
520
+ {
521
+ "epoch": 8.43,
522
+ "eval_loss": 0.13423801958560944,
523
+ "eval_runtime": 49.4363,
524
+ "eval_samples_per_second": 50.509,
525
+ "eval_steps_per_second": 6.331,
526
+ "step": 9250
527
+ },
528
+ {
529
+ "epoch": 8.65,
530
+ "learning_rate": 1.4106131849150753e-05,
531
+ "loss": 0.0062,
532
+ "step": 9500
533
+ },
534
+ {
535
+ "epoch": 8.65,
536
+ "eval_loss": 0.13279776275157928,
537
+ "eval_runtime": 49.5198,
538
+ "eval_samples_per_second": 50.424,
539
+ "eval_steps_per_second": 6.321,
540
+ "step": 9500
541
+ },
542
+ {
543
+ "epoch": 8.88,
544
+ "learning_rate": 1.1707129833989061e-05,
545
+ "loss": 0.006,
546
+ "step": 9750
547
+ },
548
+ {
549
+ "epoch": 8.88,
550
+ "eval_loss": 0.13258913159370422,
551
+ "eval_runtime": 49.511,
552
+ "eval_samples_per_second": 50.433,
553
+ "eval_steps_per_second": 6.322,
554
+ "step": 9750
555
+ },
556
+ {
557
+ "epoch": 9.11,
558
+ "learning_rate": 9.308127818827369e-06,
559
+ "loss": 0.0052,
560
+ "step": 10000
561
+ },
562
+ {
563
+ "epoch": 9.11,
564
+ "eval_loss": 0.13228829205036163,
565
+ "eval_runtime": 49.4456,
566
+ "eval_samples_per_second": 50.5,
567
+ "eval_steps_per_second": 6.33,
568
+ "step": 10000
569
+ },
570
+ {
571
+ "epoch": 9.34,
572
+ "learning_rate": 6.909125803665675e-06,
573
+ "loss": 0.0039,
574
+ "step": 10250
575
+ },
576
+ {
577
+ "epoch": 9.34,
578
+ "eval_loss": 0.13294672966003418,
579
+ "eval_runtime": 49.4619,
580
+ "eval_samples_per_second": 50.483,
581
+ "eval_steps_per_second": 6.328,
582
+ "step": 10250
583
+ },
584
+ {
585
+ "epoch": 9.57,
586
+ "learning_rate": 4.510123788503983e-06,
587
+ "loss": 0.0039,
588
+ "step": 10500
589
+ },
590
+ {
591
+ "epoch": 9.57,
592
+ "eval_loss": 0.13272705674171448,
593
+ "eval_runtime": 49.4568,
594
+ "eval_samples_per_second": 50.489,
595
+ "eval_steps_per_second": 6.329,
596
+ "step": 10500
597
+ },
598
+ {
599
+ "epoch": 9.79,
600
+ "learning_rate": 2.11112177334229e-06,
601
+ "loss": 0.004,
602
+ "step": 10750
603
+ },
604
+ {
605
+ "epoch": 9.79,
606
+ "eval_loss": 0.13220719993114471,
607
+ "eval_runtime": 49.465,
608
+ "eval_samples_per_second": 50.48,
609
+ "eval_steps_per_second": 6.328,
610
+ "step": 10750
611
+ }
612
+ ],
613
+ "max_steps": 10970,
614
+ "num_train_epochs": 10,
615
+ "total_flos": 8.710540889772442e+16,
616
+ "trial_name": null,
617
+ "trial_params": null
618
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3efa9b12ba9d3638df5286a4ecf173371d214385a5a7c0b228d6521a49c54c2e
3
+ size 4027
vocab.txt ADDED
The diff for this file is too large to render. See raw diff