TheBguy87 committed on
Commit
3b19c3f
1 Parent(s): 6b95104

Upload 14 files

config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "activation_function": "gelu_new",
+ "architectures": [
+ "GPT2LMHeadModel"
+ ],
+ "attn_pdrop": 0.1,
+ "bos_token_id": 50256,
+ "embd_pdrop": 0.1,
+ "eos_token_id": 50256,
+ "initializer_range": 0.02,
+ "layer_norm_epsilon": 1e-05,
+ "model_type": "gpt2",
+ "n_embd": 768,
+ "n_head": 12,
+ "n_inner": null,
+ "n_layer": 12,
+ "n_positions": 1024,
+ "reorder_and_upcast_attn": false,
+ "resid_pdrop": 0.1,
+ "scale_attn_by_inverse_layer_idx": false,
+ "scale_attn_weights": true,
+ "summary_activation": null,
+ "summary_first_dropout": 0.1,
+ "summary_proj_to_labels": true,
+ "summary_type": "cls_index",
+ "summary_use_proj": true,
+ "torch_dtype": "float32",
+ "transformers_version": "4.29.0.dev0",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
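This config describes the standard GPT-2 small architecture (12 layers, 12 attention heads, 768-dimensional embeddings, 50257-token vocabulary, roughly 124M parameters). As a minimal sketch, assuming the files from this commit sit in a local directory (the path below is hypothetical), they can be loaded with the transformers library:

from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# Hypothetical local path holding the files uploaded in this commit.
model_dir = "./model_folder"

config = AutoConfig.from_pretrained(model_dir)           # reads config.json
print(config.n_layer, config.n_head, config.n_embd)      # 12, 12, 768

model = AutoModelForCausalLM.from_pretrained(model_dir)  # reads pytorch_model.bin
tokenizer = AutoTokenizer.from_pretrained(model_dir)     # reads the tokenizer files

from_pretrained also accepts the Hub repo id directly once the upload is available remotely.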
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "transformers_version": "4.29.0.dev0"
+ }
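generation_config.json only pins the BOS/EOS ids to 50256 (the <|endoftext|> token) and records the transformers version; every other generation setting falls back to library defaults. A short sketch of sampling from the model, reusing the hypothetical model and tokenizer objects from the previous snippet (the sampling parameters here are illustrative, not part of this commit):

import torch

prompt = "Once upon a time"
inputs = tokenizer(prompt, return_tensors="pt")

with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=True,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token; reuse EOS
    )

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))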
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bb50bb39696a8486a40ca9006ddb25ad091c15a2d0118f4baa91c1050e4d8e73
+ size 995604017
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6d6e23c8533f24c32a7ed86ca5c04408c30d2ec560a556a6f08df26c052ab7ab
+ size 510396521
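optimizer.pt, pytorch_model.bin, and the other binary files are tracked with Git LFS, so the diff shows only a pointer file: the LFS spec version, the SHA-256 oid of the payload, and its size in bytes (about 510 MB of fp32 GPT-2 weights here). To fetch the real binaries rather than the pointers, one route is git lfs pull after cloning; another is the huggingface_hub client, sketched below with a placeholder repo id since the repository name is not shown in this diff:

from huggingface_hub import hf_hub_download

# Placeholder repo id; substitute the actual model repository.
repo_id = "TheBguy87/some-model-repo"

weights_path = hf_hub_download(repo_id=repo_id, filename="pytorch_model.bin")
print(weights_path)  # local cache path to the resolved (non-pointer) file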
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f96808c9938a930525715eb2fddd2dd78e438d0ba0a19902f6c7f705dba577d3
+ size 14503
scaler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:86807d5de345e31d4942dd66d818a90bc4bb3d823b548bbd617da213dcc32265
+ size 559
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a7a7f9f487910b01e20fd10a9578c5221f6c4419e20f86d80e4f497963fc285
+ size 623
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
+ {
+ "bos_token": "<|endoftext|>",
+ "eos_token": "<|endoftext|>",
+ "unk_token": "<|endoftext|>"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "add_prefix_space": false,
+ "bos_token": "<|endoftext|>",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|endoftext|>",
+ "model_max_length": 1024,
+ "tokenizer_class": "GPT2Tokenizer",
+ "unk_token": "<|endoftext|>"
+ }
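Together with special_tokens_map.json, vocab.json, merges.txt, and tokenizer.json, this file defines a stock GPT-2 byte-level BPE tokenizer with a 1024-token context limit and <|endoftext|> serving as BOS, EOS, and UNK. A quick sketch of exercising it, again assuming a hypothetical local directory containing this commit's files:

from transformers import GPT2Tokenizer

model_dir = "./model_folder"  # hypothetical local path, as above

tokenizer = GPT2Tokenizer.from_pretrained(model_dir)

ids = tokenizer.encode("Hello world")
print(ids)                          # BPE token ids
print(tokenizer.decode(ids))        # "Hello world"
print(tokenizer.model_max_length)   # 1024, per tokenizer_config.json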
trainer_state.json ADDED
@@ -0,0 +1,601 @@
+ {
+ "best_metric": 4.17286491394043,
+ "best_model_checkpoint": "/home/bel3/portfolio-repo-TheBguy87/final project/model_folder/checkpoint-19500",
+ "epoch": 2.9601518026565463,
+ "global_step": 19500,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.08,
+ "learning_rate": 4.873488183796367e-05,
+ "loss": 6.5463,
+ "step": 500
+ },
+ {
+ "epoch": 0.08,
+ "eval_accuracy": 0.21852576647097194,
+ "eval_loss": 5.444242477416992,
+ "eval_runtime": 27.5824,
+ "eval_samples_per_second": 51.12,
+ "eval_steps_per_second": 6.417,
+ "step": 500
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 4.746976367592733e-05,
+ "loss": 5.7814,
+ "step": 1000
+ },
+ {
+ "epoch": 0.15,
+ "eval_accuracy": 0.2316206575897628,
+ "eval_loss": 5.191591739654541,
+ "eval_runtime": 27.5501,
+ "eval_samples_per_second": 51.179,
+ "eval_steps_per_second": 6.425,
+ "step": 1000
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 4.6204645513891e-05,
+ "loss": 5.5475,
+ "step": 1500
+ },
+ {
+ "epoch": 0.23,
+ "eval_accuracy": 0.2393984816310669,
+ "eval_loss": 5.062835693359375,
+ "eval_runtime": 27.5707,
+ "eval_samples_per_second": 51.141,
+ "eval_steps_per_second": 6.42,
+ "step": 1500
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 4.493952735185467e-05,
+ "loss": 5.4202,
+ "step": 2000
+ },
+ {
+ "epoch": 0.3,
+ "eval_accuracy": 0.24114169130199442,
+ "eval_loss": 4.970830917358398,
+ "eval_runtime": 27.5621,
+ "eval_samples_per_second": 51.157,
+ "eval_steps_per_second": 6.422,
+ "step": 2000
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 4.367440918981833e-05,
+ "loss": 5.3562,
+ "step": 2500
+ },
+ {
+ "epoch": 0.38,
+ "eval_accuracy": 0.24209795839058446,
+ "eval_loss": 4.920139312744141,
+ "eval_runtime": 27.5764,
+ "eval_samples_per_second": 51.131,
+ "eval_steps_per_second": 6.419,
+ "step": 2500
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 4.240929102778199e-05,
+ "loss": 5.2881,
+ "step": 3000
+ },
+ {
+ "epoch": 0.46,
+ "eval_accuracy": 0.2436371459105356,
+ "eval_loss": 4.872959136962891,
+ "eval_runtime": 27.5692,
+ "eval_samples_per_second": 51.144,
+ "eval_steps_per_second": 6.42,
+ "step": 3000
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 4.114417286574566e-05,
+ "loss": 5.2296,
+ "step": 3500
+ },
+ {
+ "epoch": 0.53,
+ "eval_accuracy": 0.2506516217679144,
+ "eval_loss": 4.810730457305908,
+ "eval_runtime": 27.5767,
+ "eval_samples_per_second": 51.13,
+ "eval_steps_per_second": 6.418,
+ "step": 3500
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 3.987905470370933e-05,
+ "loss": 5.1578,
+ "step": 4000
+ },
+ {
+ "epoch": 0.61,
+ "eval_accuracy": 0.2533899598895227,
+ "eval_loss": 4.767050743103027,
+ "eval_runtime": 27.5725,
+ "eval_samples_per_second": 51.138,
+ "eval_steps_per_second": 6.419,
+ "step": 4000
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 3.8613936541672994e-05,
+ "loss": 5.0779,
+ "step": 4500
+ },
+ {
+ "epoch": 0.68,
+ "eval_accuracy": 0.2565418939362396,
+ "eval_loss": 4.725719451904297,
+ "eval_runtime": 27.5923,
+ "eval_samples_per_second": 51.101,
+ "eval_steps_per_second": 6.415,
+ "step": 4500
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 3.734881837963666e-05,
+ "loss": 5.0276,
+ "step": 5000
+ },
+ {
+ "epoch": 0.76,
+ "eval_accuracy": 0.2623502796630165,
+ "eval_loss": 4.667919635772705,
+ "eval_runtime": 27.5734,
+ "eval_samples_per_second": 51.136,
+ "eval_steps_per_second": 6.419,
+ "step": 5000
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 3.6083700217600326e-05,
+ "loss": 5.0059,
+ "step": 5500
+ },
+ {
+ "epoch": 0.83,
+ "eval_accuracy": 0.2663904734146646,
+ "eval_loss": 4.631860256195068,
+ "eval_runtime": 27.5657,
+ "eval_samples_per_second": 51.151,
+ "eval_steps_per_second": 6.421,
+ "step": 5500
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 3.481858205556399e-05,
+ "loss": 4.9264,
+ "step": 6000
+ },
+ {
+ "epoch": 0.91,
+ "eval_accuracy": 0.2692717658325353,
+ "eval_loss": 4.584375858306885,
+ "eval_runtime": 27.5605,
+ "eval_samples_per_second": 51.16,
+ "eval_steps_per_second": 6.422,
+ "step": 6000
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 3.355346389352766e-05,
+ "loss": 4.9049,
+ "step": 6500
+ },
+ {
+ "epoch": 0.99,
+ "eval_accuracy": 0.272461173335554,
+ "eval_loss": 4.553061485290527,
+ "eval_runtime": 27.5742,
+ "eval_samples_per_second": 51.135,
+ "eval_steps_per_second": 6.419,
+ "step": 6500
+ },
+ {
+ "epoch": 1.06,
+ "learning_rate": 3.228834573149132e-05,
+ "loss": 4.8232,
+ "step": 7000
+ },
+ {
+ "epoch": 1.06,
+ "eval_accuracy": 0.27501491998723127,
+ "eval_loss": 4.517702579498291,
+ "eval_runtime": 27.5623,
+ "eval_samples_per_second": 51.157,
+ "eval_steps_per_second": 6.422,
+ "step": 7000
+ },
+ {
+ "epoch": 1.14,
+ "learning_rate": 3.102322756945499e-05,
+ "loss": 4.751,
+ "step": 7500
+ },
+ {
+ "epoch": 1.14,
+ "eval_accuracy": 0.2793854353166507,
+ "eval_loss": 4.4822540283203125,
+ "eval_runtime": 27.5718,
+ "eval_samples_per_second": 51.139,
+ "eval_steps_per_second": 6.42,
+ "step": 7500
+ },
+ {
+ "epoch": 1.21,
+ "learning_rate": 2.9758109407418657e-05,
+ "loss": 4.7553,
+ "step": 8000
+ },
+ {
+ "epoch": 1.21,
+ "eval_accuracy": 0.2807802806345505,
+ "eval_loss": 4.459610939025879,
+ "eval_runtime": 27.5709,
+ "eval_samples_per_second": 51.141,
+ "eval_steps_per_second": 6.42,
+ "step": 8000
+ },
+ {
+ "epoch": 1.29,
+ "learning_rate": 2.8495521481706393e-05,
+ "loss": 4.7213,
+ "step": 8500
+ },
+ {
+ "epoch": 1.29,
+ "eval_accuracy": 0.28297872340425534,
+ "eval_loss": 4.435904026031494,
+ "eval_runtime": 27.5742,
+ "eval_samples_per_second": 51.135,
+ "eval_steps_per_second": 6.419,
+ "step": 8500
+ },
+ {
+ "epoch": 1.37,
+ "learning_rate": 2.7230403319670056e-05,
+ "loss": 4.6646,
+ "step": 9000
+ },
+ {
+ "epoch": 1.37,
+ "eval_accuracy": 0.28641656604349697,
+ "eval_loss": 4.399264335632324,
+ "eval_runtime": 27.5882,
+ "eval_samples_per_second": 51.109,
+ "eval_steps_per_second": 6.416,
+ "step": 9000
+ },
+ {
+ "epoch": 1.44,
+ "learning_rate": 2.597034563028187e-05,
+ "loss": 4.6709,
+ "step": 9500
+ },
+ {
+ "epoch": 1.44,
+ "eval_accuracy": 0.2876531901014559,
+ "eval_loss": 4.388927936553955,
+ "eval_runtime": 27.5803,
+ "eval_samples_per_second": 51.123,
+ "eval_steps_per_second": 6.418,
+ "step": 9500
+ },
+ {
+ "epoch": 1.52,
+ "learning_rate": 2.4705227468245535e-05,
+ "loss": 4.6179,
+ "step": 10000
+ },
+ {
+ "epoch": 1.52,
+ "eval_accuracy": 0.2888079277178665,
+ "eval_loss": 4.365257263183594,
+ "eval_runtime": 27.58,
+ "eval_samples_per_second": 51.124,
+ "eval_steps_per_second": 6.418,
+ "step": 10000
+ },
+ {
+ "epoch": 1.59,
+ "learning_rate": 2.34401093062092e-05,
+ "loss": 4.5984,
+ "step": 10500
+ },
+ {
+ "epoch": 1.59,
+ "eval_accuracy": 0.2886746887621268,
+ "eval_loss": 4.34913444519043,
+ "eval_runtime": 27.5823,
+ "eval_samples_per_second": 51.12,
+ "eval_steps_per_second": 6.417,
+ "step": 10500
+ },
+ {
+ "epoch": 1.67,
+ "learning_rate": 2.2174991144172867e-05,
+ "loss": 4.5868,
+ "step": 11000
+ },
+ {
+ "epoch": 1.67,
+ "eval_accuracy": 0.29093697519812356,
+ "eval_loss": 4.338951587677002,
+ "eval_runtime": 27.5873,
+ "eval_samples_per_second": 51.11,
+ "eval_steps_per_second": 6.416,
+ "step": 11000
+ },
+ {
+ "epoch": 1.75,
+ "learning_rate": 2.0909872982136533e-05,
+ "loss": 4.5543,
+ "step": 11500
+ },
+ {
+ "epoch": 1.75,
+ "eval_accuracy": 0.29370307143551094,
+ "eval_loss": 4.3105316162109375,
+ "eval_runtime": 27.5819,
+ "eval_samples_per_second": 51.12,
+ "eval_steps_per_second": 6.417,
+ "step": 11500
+ },
+ {
+ "epoch": 1.82,
+ "learning_rate": 1.9644754820100196e-05,
+ "loss": 4.5493,
+ "step": 12000
+ },
+ {
+ "epoch": 1.82,
+ "eval_accuracy": 0.29424019097583654,
+ "eval_loss": 4.301023483276367,
+ "eval_runtime": 27.5881,
+ "eval_samples_per_second": 51.109,
+ "eval_steps_per_second": 6.416,
+ "step": 12000
+ },
+ {
+ "epoch": 1.9,
+ "learning_rate": 1.8379636658063865e-05,
+ "loss": 4.5038,
+ "step": 12500
+ },
+ {
+ "epoch": 1.9,
+ "eval_accuracy": 0.294011186520659,
+ "eval_loss": 4.281564235687256,
+ "eval_runtime": 27.5889,
+ "eval_samples_per_second": 51.107,
+ "eval_steps_per_second": 6.416,
+ "step": 12500
+ },
+ {
+ "epoch": 1.97,
+ "learning_rate": 1.7114518496027528e-05,
+ "loss": 4.4836,
+ "step": 13000
+ },
+ {
+ "epoch": 1.97,
+ "eval_accuracy": 0.29663155265020613,
+ "eval_loss": 4.268117427825928,
+ "eval_runtime": 27.5869,
+ "eval_samples_per_second": 51.111,
+ "eval_steps_per_second": 6.416,
+ "step": 13000
+ },
+ {
+ "epoch": 2.05,
+ "learning_rate": 1.5849400333991198e-05,
+ "loss": 4.4394,
+ "step": 13500
+ },
+ {
+ "epoch": 2.05,
+ "eval_accuracy": 0.2960416926899002,
+ "eval_loss": 4.262997150421143,
+ "eval_runtime": 27.5906,
+ "eval_samples_per_second": 51.104,
+ "eval_steps_per_second": 6.415,
+ "step": 13500
+ },
+ {
+ "epoch": 2.13,
+ "learning_rate": 1.458428217195486e-05,
+ "loss": 4.4429,
+ "step": 14000
+ },
+ {
+ "epoch": 2.13,
+ "eval_accuracy": 0.2994989660101872,
+ "eval_loss": 4.247574329376221,
+ "eval_runtime": 27.5874,
+ "eval_samples_per_second": 51.11,
+ "eval_steps_per_second": 6.416,
+ "step": 14000
+ },
+ {
+ "epoch": 2.2,
+ "learning_rate": 1.3319164009918528e-05,
+ "loss": 4.4404,
+ "step": 14500
+ },
+ {
+ "epoch": 2.2,
+ "eval_accuracy": 0.3004260870772092,
+ "eval_loss": 4.236292839050293,
+ "eval_runtime": 27.9352,
+ "eval_samples_per_second": 50.474,
+ "eval_steps_per_second": 6.336,
+ "step": 14500
+ },
+ {
+ "epoch": 2.28,
+ "learning_rate": 1.2054045847882193e-05,
+ "loss": 4.4148,
+ "step": 15000
+ },
+ {
+ "epoch": 2.28,
+ "eval_accuracy": 0.3013920695063219,
+ "eval_loss": 4.226977825164795,
+ "eval_runtime": 27.9368,
+ "eval_samples_per_second": 50.471,
+ "eval_steps_per_second": 6.336,
+ "step": 15000
+ },
+ {
+ "epoch": 2.35,
+ "learning_rate": 1.0791457922169932e-05,
+ "loss": 4.3944,
+ "step": 15500
+ },
+ {
+ "epoch": 2.35,
+ "eval_accuracy": 0.3011366948411542,
+ "eval_loss": 4.218454360961914,
+ "eval_runtime": 28.163,
+ "eval_samples_per_second": 50.066,
+ "eval_steps_per_second": 6.285,
+ "step": 15500
+ },
+ {
+ "epoch": 2.43,
+ "learning_rate": 9.526339760133597e-06,
+ "loss": 4.3942,
+ "step": 16000
+ },
+ {
+ "epoch": 2.43,
+ "eval_accuracy": 0.3020194029229296,
+ "eval_loss": 4.213292121887207,
+ "eval_runtime": 28.5992,
+ "eval_samples_per_second": 49.302,
+ "eval_steps_per_second": 6.189,
+ "step": 16000
+ },
+ {
+ "epoch": 2.5,
+ "learning_rate": 8.261221598097263e-06,
+ "loss": 4.3789,
+ "step": 16500
+ },
+ {
+ "epoch": 2.5,
+ "eval_accuracy": 0.3029826095404644,
+ "eval_loss": 4.205790996551514,
+ "eval_runtime": 28.126,
+ "eval_samples_per_second": 50.131,
+ "eval_steps_per_second": 6.293,
+ "step": 16500
+ },
+ {
+ "epoch": 2.58,
+ "learning_rate": 6.996103436060929e-06,
+ "loss": 4.3444,
+ "step": 17000
+ },
+ {
+ "epoch": 2.58,
+ "eval_accuracy": 0.3037764916517467,
+ "eval_loss": 4.199104309082031,
+ "eval_runtime": 27.8045,
+ "eval_samples_per_second": 50.711,
+ "eval_steps_per_second": 6.366,
+ "step": 17000
+ },
+ {
+ "epoch": 2.66,
+ "learning_rate": 5.733515510348667e-06,
+ "loss": 4.3503,
+ "step": 17500
+ },
+ {
+ "epoch": 2.66,
+ "eval_accuracy": 0.3041026495121511,
+ "eval_loss": 4.189632415771484,
+ "eval_runtime": 28.554,
+ "eval_samples_per_second": 49.38,
+ "eval_steps_per_second": 6.199,
+ "step": 17500
+ },
+ {
+ "epoch": 2.73,
+ "learning_rate": 4.468397348312332e-06,
+ "loss": 4.3445,
+ "step": 18000
+ },
+ {
+ "epoch": 2.73,
+ "eval_accuracy": 0.30501727942707246,
+ "eval_loss": 4.184421062469482,
+ "eval_runtime": 27.7494,
+ "eval_samples_per_second": 50.812,
+ "eval_steps_per_second": 6.379,
+ "step": 18000
+ },
+ {
+ "epoch": 2.81,
+ "learning_rate": 3.2058094226000707e-06,
+ "loss": 4.3081,
+ "step": 18500
+ },
+ {
+ "epoch": 2.81,
+ "eval_accuracy": 0.3053531526279996,
+ "eval_loss": 4.178225994110107,
+ "eval_runtime": 27.5642,
+ "eval_samples_per_second": 51.153,
+ "eval_steps_per_second": 6.421,
+ "step": 18500
+ },
+ {
+ "epoch": 2.88,
+ "learning_rate": 1.9406912605637364e-06,
+ "loss": 4.3215,
+ "step": 19000
+ },
+ {
+ "epoch": 2.88,
+ "eval_accuracy": 0.30591109075515954,
+ "eval_loss": 4.176368713378906,
+ "eval_runtime": 27.554,
+ "eval_samples_per_second": 51.172,
+ "eval_steps_per_second": 6.424,
+ "step": 19000
+ },
+ {
+ "epoch": 2.96,
+ "learning_rate": 6.755730985274025e-07,
+ "loss": 4.3413,
+ "step": 19500
+ },
+ {
+ "epoch": 2.96,
+ "eval_accuracy": 0.30599714091407476,
+ "eval_loss": 4.17286491394043,
+ "eval_runtime": 27.9537,
+ "eval_samples_per_second": 50.441,
+ "eval_steps_per_second": 6.332,
+ "step": 19500
+ }
+ ],
+ "max_steps": 19761,
+ "num_train_epochs": 3,
+ "total_flos": 2.0380255911936e+16,
+ "trial_name": null,
+ "trial_params": null
+ }
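trainer_state.json is the Trainer's running log: training loss and evaluation metrics every 500 steps across roughly three epochs, with eval loss dropping from 5.44 to the best_metric of 4.173 at checkpoint-19500. A small sketch for reading those curves back out of the file, e.g. for plotting or picking the best checkpoint:

import json

with open("trainer_state.json") as f:
    state = json.load(f)

train_logs = [e for e in state["log_history"] if "loss" in e]       # training entries
eval_logs  = [e for e in state["log_history"] if "eval_loss" in e]  # evaluation entries

print("best eval loss:", state["best_metric"])
print("best checkpoint:", state["best_model_checkpoint"])
for e in eval_logs[-3:]:
    print(e["step"], round(e["eval_loss"], 3), round(e["eval_accuracy"], 4))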
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e0524b826d7bcd879ed7d8eab0cc57c477df4c6d4ee1a9576bf2650ce6252d36
+ size 3951
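training_args.bin is not JSON but a pickled transformers TrainingArguments object, so inspecting it means deserializing with torch in an environment where transformers is importable; a minimal sketch (recent torch releases may require weights_only=False for a full unpickle):

import torch

# Full unpickle of the saved TrainingArguments; needs transformers installed.
args = torch.load("training_args.bin", weights_only=False)
print(args.num_train_epochs, args.learning_rate, args.per_device_train_batch_size)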
vocab.json ADDED
The diff for this file is too large to render. See raw diff