Aspik101 committed on
Commit
15d7bf2
1 Parent(s): 240993c

Hubert commit

Browse files
config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "sdadas/polish-gpt2-small",
3
+ "activation_function": "gelu_fast",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 0,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 2,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_embd": 768,
15
+ "n_head": 12,
16
+ "n_inner": 3072,
17
+ "n_layer": 12,
18
+ "n_positions": 2048,
19
+ "reorder_and_upcast_attn": false,
20
+ "resid_pdrop": 0.1,
21
+ "scale_attn_by_inverse_layer_idx": false,
22
+ "scale_attn_weights": true,
23
+ "summary_activation": null,
24
+ "summary_first_dropout": 0.1,
25
+ "summary_proj_to_labels": true,
26
+ "summary_type": "cls_index",
27
+ "summary_use_proj": true,
28
+ "tokenizer_class": "GPT2TokenizerFast",
29
+ "torch_dtype": "float32",
30
+ "transformers_version": "4.27.0.dev0",
31
+ "use_cache": true,
32
+ "vocab_size": 51200
33
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.27.0.dev0"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f710ee421e9692d4657fc57c31c9abe2564100cc08e791e2273ea79e6c255966
3
+ size 1007690437
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44172502dc260ed58b7c5bdca63f4d1e2b4ae3329d205a511cf410db81808542
3
+ size 554189373
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:840e41187a9efdb8fb051986823a8130f419ca1fc19d194e0ed3575236246701
3
+ size 14575
scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45e4df22341441d04078822b3eea276fc839579c2847943df3b1cd2dec611f9d
3
+ size 557
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33c0f46ebd623bad71c10c220553997e329d77909df57f614e50893b392a627b
3
+ size 627
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "unk_token": "<unk>"
5
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "bos_token": "<s>",
5
+ "eos_token": "</s>",
6
+ "errors": "replace",
7
+ "model_max_length": 1000000000000000019884624838656,
8
+ "special_tokens_map_file": "original/polish-gpt2-small/special_tokens_map.json",
9
+ "tokenizer_class": "GPT2Tokenizer",
10
+ "unk_token": "<unk>"
11
+ }
trainer_state.json ADDED
@@ -0,0 +1,416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 20.0,
5
+ "global_step": 10000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.5,
12
+ "learning_rate": 0.0002954545454545454,
13
+ "loss": 0.71,
14
+ "step": 250
15
+ },
16
+ {
17
+ "epoch": 1.0,
18
+ "learning_rate": 0.00028787878787878786,
19
+ "loss": 0.5325,
20
+ "step": 500
21
+ },
22
+ {
23
+ "epoch": 1.0,
24
+ "eval_loss": 0.5033329725265503,
25
+ "eval_runtime": 9.0003,
26
+ "eval_samples_per_second": 888.861,
27
+ "eval_steps_per_second": 27.777,
28
+ "step": 500
29
+ },
30
+ {
31
+ "epoch": 1.5,
32
+ "learning_rate": 0.0002803030303030303,
33
+ "loss": 0.4322,
34
+ "step": 750
35
+ },
36
+ {
37
+ "epoch": 2.0,
38
+ "learning_rate": 0.0002727272727272727,
39
+ "loss": 0.441,
40
+ "step": 1000
41
+ },
42
+ {
43
+ "epoch": 2.0,
44
+ "eval_loss": 0.4973089098930359,
45
+ "eval_runtime": 9.0771,
46
+ "eval_samples_per_second": 881.341,
47
+ "eval_steps_per_second": 27.542,
48
+ "step": 1000
49
+ },
50
+ {
51
+ "epoch": 2.5,
52
+ "learning_rate": 0.0002651515151515151,
53
+ "loss": 0.352,
54
+ "step": 1250
55
+ },
56
+ {
57
+ "epoch": 3.0,
58
+ "learning_rate": 0.00025757575757575756,
59
+ "loss": 0.372,
60
+ "step": 1500
61
+ },
62
+ {
63
+ "epoch": 3.0,
64
+ "eval_loss": 0.5114700198173523,
65
+ "eval_runtime": 9.3118,
66
+ "eval_samples_per_second": 859.12,
67
+ "eval_steps_per_second": 26.848,
68
+ "step": 1500
69
+ },
70
+ {
71
+ "epoch": 3.5,
72
+ "learning_rate": 0.00025,
73
+ "loss": 0.2921,
74
+ "step": 1750
75
+ },
76
+ {
77
+ "epoch": 4.0,
78
+ "learning_rate": 0.0002424242424242424,
79
+ "loss": 0.3107,
80
+ "step": 2000
81
+ },
82
+ {
83
+ "epoch": 4.0,
84
+ "eval_loss": 0.5385135412216187,
85
+ "eval_runtime": 9.7573,
86
+ "eval_samples_per_second": 819.896,
87
+ "eval_steps_per_second": 25.622,
88
+ "step": 2000
89
+ },
90
+ {
91
+ "epoch": 4.5,
92
+ "learning_rate": 0.00023484848484848483,
93
+ "loss": 0.2439,
94
+ "step": 2250
95
+ },
96
+ {
97
+ "epoch": 5.0,
98
+ "learning_rate": 0.00022727272727272725,
99
+ "loss": 0.2619,
100
+ "step": 2500
101
+ },
102
+ {
103
+ "epoch": 5.0,
104
+ "eval_loss": 0.5607308149337769,
105
+ "eval_runtime": 9.0142,
106
+ "eval_samples_per_second": 887.492,
107
+ "eval_steps_per_second": 27.734,
108
+ "step": 2500
109
+ },
110
+ {
111
+ "epoch": 5.5,
112
+ "learning_rate": 0.00021969696969696969,
113
+ "loss": 0.2105,
114
+ "step": 2750
115
+ },
116
+ {
117
+ "epoch": 6.0,
118
+ "learning_rate": 0.0002121212121212121,
119
+ "loss": 0.2257,
120
+ "step": 3000
121
+ },
122
+ {
123
+ "epoch": 6.0,
124
+ "eval_loss": 0.5820582509040833,
125
+ "eval_runtime": 22.0785,
126
+ "eval_samples_per_second": 362.343,
127
+ "eval_steps_per_second": 11.323,
128
+ "step": 3000
129
+ },
130
+ {
131
+ "epoch": 6.5,
132
+ "learning_rate": 0.0002045454545454545,
133
+ "loss": 0.1812,
134
+ "step": 3250
135
+ },
136
+ {
137
+ "epoch": 7.0,
138
+ "learning_rate": 0.00019696969696969695,
139
+ "loss": 0.1947,
140
+ "step": 3500
141
+ },
142
+ {
143
+ "epoch": 7.0,
144
+ "eval_loss": 0.6063645482063293,
145
+ "eval_runtime": 22.1706,
146
+ "eval_samples_per_second": 360.838,
147
+ "eval_steps_per_second": 11.276,
148
+ "step": 3500
149
+ },
150
+ {
151
+ "epoch": 7.5,
152
+ "learning_rate": 0.00018939393939393937,
153
+ "loss": 0.1597,
154
+ "step": 3750
155
+ },
156
+ {
157
+ "epoch": 8.0,
158
+ "learning_rate": 0.0001818181818181818,
159
+ "loss": 0.1715,
160
+ "step": 4000
161
+ },
162
+ {
163
+ "epoch": 8.0,
164
+ "eval_loss": 0.6192964911460876,
165
+ "eval_runtime": 22.493,
166
+ "eval_samples_per_second": 355.666,
167
+ "eval_steps_per_second": 11.115,
168
+ "step": 4000
169
+ },
170
+ {
171
+ "epoch": 8.5,
172
+ "learning_rate": 0.00017424242424242422,
173
+ "loss": 0.1439,
174
+ "step": 4250
175
+ },
176
+ {
177
+ "epoch": 9.0,
178
+ "learning_rate": 0.00016666666666666666,
179
+ "loss": 0.1535,
180
+ "step": 4500
181
+ },
182
+ {
183
+ "epoch": 9.0,
184
+ "eval_loss": 0.6339118480682373,
185
+ "eval_runtime": 17.7948,
186
+ "eval_samples_per_second": 449.57,
187
+ "eval_steps_per_second": 14.049,
188
+ "step": 4500
189
+ },
190
+ {
191
+ "epoch": 9.5,
192
+ "learning_rate": 0.00015909090909090907,
193
+ "loss": 0.1322,
194
+ "step": 4750
195
+ },
196
+ {
197
+ "epoch": 10.0,
198
+ "learning_rate": 0.00015151515151515152,
199
+ "loss": 0.1403,
200
+ "step": 5000
201
+ },
202
+ {
203
+ "epoch": 10.0,
204
+ "eval_loss": 0.648122251033783,
205
+ "eval_runtime": 22.0949,
206
+ "eval_samples_per_second": 362.074,
207
+ "eval_steps_per_second": 11.315,
208
+ "step": 5000
209
+ },
210
+ {
211
+ "epoch": 10.5,
212
+ "learning_rate": 0.00014393939393939393,
213
+ "loss": 0.1229,
214
+ "step": 5250
215
+ },
216
+ {
217
+ "epoch": 11.0,
218
+ "learning_rate": 0.00013636363636363634,
219
+ "loss": 0.1297,
220
+ "step": 5500
221
+ },
222
+ {
223
+ "epoch": 11.0,
224
+ "eval_loss": 0.6578399538993835,
225
+ "eval_runtime": 22.4161,
226
+ "eval_samples_per_second": 356.887,
227
+ "eval_steps_per_second": 11.153,
228
+ "step": 5500
229
+ },
230
+ {
231
+ "epoch": 11.5,
232
+ "learning_rate": 0.00012878787878787878,
233
+ "loss": 0.1158,
234
+ "step": 5750
235
+ },
236
+ {
237
+ "epoch": 12.0,
238
+ "learning_rate": 0.0001212121212121212,
239
+ "loss": 0.1217,
240
+ "step": 6000
241
+ },
242
+ {
243
+ "epoch": 12.0,
244
+ "eval_loss": 0.6666226983070374,
245
+ "eval_runtime": 17.8264,
246
+ "eval_samples_per_second": 448.773,
247
+ "eval_steps_per_second": 14.024,
248
+ "step": 6000
249
+ },
250
+ {
251
+ "epoch": 12.5,
252
+ "learning_rate": 0.00011363636363636362,
253
+ "loss": 0.1106,
254
+ "step": 6250
255
+ },
256
+ {
257
+ "epoch": 13.0,
258
+ "learning_rate": 0.00010606060606060605,
259
+ "loss": 0.1152,
260
+ "step": 6500
261
+ },
262
+ {
263
+ "epoch": 13.0,
264
+ "eval_loss": 0.6757141351699829,
265
+ "eval_runtime": 15.3201,
266
+ "eval_samples_per_second": 522.191,
267
+ "eval_steps_per_second": 16.318,
268
+ "step": 6500
269
+ },
270
+ {
271
+ "epoch": 13.5,
272
+ "learning_rate": 9.848484848484848e-05,
273
+ "loss": 0.1061,
274
+ "step": 6750
275
+ },
276
+ {
277
+ "epoch": 14.0,
278
+ "learning_rate": 9.09090909090909e-05,
279
+ "loss": 0.1101,
280
+ "step": 7000
281
+ },
282
+ {
283
+ "epoch": 14.0,
284
+ "eval_loss": 0.687734842300415,
285
+ "eval_runtime": 21.8339,
286
+ "eval_samples_per_second": 366.402,
287
+ "eval_steps_per_second": 11.45,
288
+ "step": 7000
289
+ },
290
+ {
291
+ "epoch": 14.5,
292
+ "learning_rate": 8.333333333333333e-05,
293
+ "loss": 0.1024,
294
+ "step": 7250
295
+ },
296
+ {
297
+ "epoch": 15.0,
298
+ "learning_rate": 7.575757575757576e-05,
299
+ "loss": 0.1059,
300
+ "step": 7500
301
+ },
302
+ {
303
+ "epoch": 15.0,
304
+ "eval_loss": 0.6898844242095947,
305
+ "eval_runtime": 22.4671,
306
+ "eval_samples_per_second": 356.076,
307
+ "eval_steps_per_second": 11.127,
308
+ "step": 7500
309
+ },
310
+ {
311
+ "epoch": 15.5,
312
+ "learning_rate": 6.818181818181817e-05,
313
+ "loss": 0.0991,
314
+ "step": 7750
315
+ },
316
+ {
317
+ "epoch": 16.0,
318
+ "learning_rate": 6.06060606060606e-05,
319
+ "loss": 0.1022,
320
+ "step": 8000
321
+ },
322
+ {
323
+ "epoch": 16.0,
324
+ "eval_loss": 0.6962941884994507,
325
+ "eval_runtime": 17.8764,
326
+ "eval_samples_per_second": 447.518,
327
+ "eval_steps_per_second": 13.985,
328
+ "step": 8000
329
+ },
330
+ {
331
+ "epoch": 16.5,
332
+ "learning_rate": 5.3030303030303025e-05,
333
+ "loss": 0.0963,
334
+ "step": 8250
335
+ },
336
+ {
337
+ "epoch": 17.0,
338
+ "learning_rate": 4.545454545454545e-05,
339
+ "loss": 0.0991,
340
+ "step": 8500
341
+ },
342
+ {
343
+ "epoch": 17.0,
344
+ "eval_loss": 0.702376127243042,
345
+ "eval_runtime": 21.7707,
346
+ "eval_samples_per_second": 367.466,
347
+ "eval_steps_per_second": 11.483,
348
+ "step": 8500
349
+ },
350
+ {
351
+ "epoch": 17.5,
352
+ "learning_rate": 3.790909090909091e-05,
353
+ "loss": 0.0939,
354
+ "step": 8750
355
+ },
356
+ {
357
+ "epoch": 18.0,
358
+ "learning_rate": 3.033333333333333e-05,
359
+ "loss": 0.0961,
360
+ "step": 9000
361
+ },
362
+ {
363
+ "epoch": 18.0,
364
+ "eval_loss": 0.7053780555725098,
365
+ "eval_runtime": 22.1712,
366
+ "eval_samples_per_second": 360.829,
367
+ "eval_steps_per_second": 11.276,
368
+ "step": 9000
369
+ },
370
+ {
371
+ "epoch": 18.5,
372
+ "learning_rate": 2.2757575757575757e-05,
373
+ "loss": 0.0914,
374
+ "step": 9250
375
+ },
376
+ {
377
+ "epoch": 19.0,
378
+ "learning_rate": 1.518181818181818e-05,
379
+ "loss": 0.0933,
380
+ "step": 9500
381
+ },
382
+ {
383
+ "epoch": 19.0,
384
+ "eval_loss": 0.7104560732841492,
385
+ "eval_runtime": 22.4548,
386
+ "eval_samples_per_second": 356.271,
387
+ "eval_steps_per_second": 11.133,
388
+ "step": 9500
389
+ },
390
+ {
391
+ "epoch": 19.5,
392
+ "learning_rate": 7.6060606060606056e-06,
393
+ "loss": 0.0893,
394
+ "step": 9750
395
+ },
396
+ {
397
+ "epoch": 20.0,
398
+ "learning_rate": 3.03030303030303e-08,
399
+ "loss": 0.0901,
400
+ "step": 10000
401
+ },
402
+ {
403
+ "epoch": 20.0,
404
+ "eval_loss": 0.7170030474662781,
405
+ "eval_runtime": 17.8822,
406
+ "eval_samples_per_second": 447.372,
407
+ "eval_steps_per_second": 13.98,
408
+ "step": 10000
409
+ }
410
+ ],
411
+ "max_steps": 10000,
412
+ "num_train_epochs": 20,
413
+ "total_flos": 4.180672512e+16,
414
+ "trial_name": null,
415
+ "trial_params": null
416
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12ad788ada8ce49fee2a1542b25bae9934d7b25006de0ce420d5f4ca7cd80ef5
3
+ size 3579
vocab.json ADDED
The diff for this file is too large to render. See raw diff