Ko commited on
Commit
4b2e733
1 Parent(s): 444a401

first commit

Browse files
config.json ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "metamong1/bigbird-tapt-ep3",
3
+ "architectures": [
4
+ "EncoderDecoderModel"
5
+ ],
6
+ "decoder": {
7
+ "_name_or_path": "gogamza/kobart-base-v1",
8
+ "activation_dropout": 0.0,
9
+ "activation_function": "gelu",
10
+ "add_bias_logits": false,
11
+ "add_cross_attention": true,
12
+ "add_final_layer_norm": false,
13
+ "architectures": [
14
+ "BartModel"
15
+ ],
16
+ "attention_dropout": 0.0,
17
+ "author": "Heewon Jeon(madjakarta@gmail.com)",
18
+ "bad_words_ids": null,
19
+ "bos_token_id": 0,
20
+ "chunk_size_feed_forward": 0,
21
+ "classif_dropout": 0.1,
22
+ "classifier_dropout": 0.1,
23
+ "d_model": 768,
24
+ "decoder_attention_heads": 16,
25
+ "decoder_ffn_dim": 3072,
26
+ "decoder_layerdrop": 0.0,
27
+ "decoder_layers": 6,
28
+ "decoder_start_token_id": 1,
29
+ "diversity_penalty": 0.0,
30
+ "do_blenderbot_90_layernorm": false,
31
+ "do_sample": false,
32
+ "doc_type_size": 4,
33
+ "dropout": 0.1,
34
+ "early_stopping": false,
35
+ "encoder_attention_heads": 16,
36
+ "encoder_ffn_dim": 3072,
37
+ "encoder_layerdrop": 0.0,
38
+ "encoder_layers": 6,
39
+ "encoder_no_repeat_ngram_size": 0,
40
+ "eos_token_id": 1,
41
+ "extra_pos_embeddings": 2,
42
+ "finetuning_task": null,
43
+ "force_bos_token_to_be_generated": false,
44
+ "forced_bos_token_id": null,
45
+ "forced_eos_token_id": 1,
46
+ "gradient_checkpointing": false,
47
+ "id2label": {
48
+ "0": "NEGATIVE",
49
+ "1": "POSITIVE"
50
+ },
51
+ "init_std": 0.02,
52
+ "is_decoder": true,
53
+ "is_encoder_decoder": true,
54
+ "kobart_version": 1.0,
55
+ "label2id": {
56
+ "NEGATIVE": 0,
57
+ "POSITIVE": 1
58
+ },
59
+ "length_penalty": 1.0,
60
+ "max_length": 20,
61
+ "max_position_embeddings": 2048,
62
+ "min_length": 0,
63
+ "model_type": "bart",
64
+ "no_repeat_ngram_size": 0,
65
+ "normalize_before": false,
66
+ "normalize_embedding": true,
67
+ "num_beam_groups": 1,
68
+ "num_beams": 1,
69
+ "num_hidden_layers": 6,
70
+ "num_return_sequences": 1,
71
+ "output_attentions": false,
72
+ "output_hidden_states": false,
73
+ "output_scores": false,
74
+ "pad_token_id": 0,
75
+ "prefix": null,
76
+ "problem_type": null,
77
+ "pruned_heads": {},
78
+ "remove_invalid_values": false,
79
+ "repetition_penalty": 1.0,
80
+ "return_dict": true,
81
+ "return_dict_in_generate": false,
82
+ "scale_embedding": false,
83
+ "sep_token_id": null,
84
+ "static_position_embeddings": false,
85
+ "task_specific_params": null,
86
+ "temperature": 1.0,
87
+ "tie_encoder_decoder": false,
88
+ "tie_word_embeddings": true,
89
+ "tokenizer_class": "PreTrainedTokenizerFast",
90
+ "top_k": 50,
91
+ "top_p": 1.0,
92
+ "torch_dtype": null,
93
+ "torchscript": false,
94
+ "transformers_version": "4.11.0",
95
+ "use_bfloat16": false,
96
+ "use_cache": true,
97
+ "vocab_size": 32500
98
+ },
99
+ "encoder": {
100
+ "_name_or_path": "monologg/kobigbird-bert-base",
101
+ "add_cross_attention": false,
102
+ "architectures": [
103
+ "BigBirdForMaskedLM"
104
+ ],
105
+ "attention_probs_dropout_prob": 0.1,
106
+ "attention_type": "block_sparse",
107
+ "bad_words_ids": null,
108
+ "block_size": 64,
109
+ "bos_token_id": 5,
110
+ "chunk_size_feed_forward": 0,
111
+ "classifier_dropout": null,
112
+ "decoder_start_token_id": null,
113
+ "diversity_penalty": 0.0,
114
+ "do_sample": false,
115
+ "doc_type_size": 4,
116
+ "early_stopping": false,
117
+ "encoder_no_repeat_ngram_size": 0,
118
+ "eos_token_id": 6,
119
+ "finetuning_task": null,
120
+ "forced_bos_token_id": null,
121
+ "forced_eos_token_id": null,
122
+ "gradient_checkpointing": false,
123
+ "hidden_act": "gelu_new",
124
+ "hidden_dropout_prob": 0.1,
125
+ "hidden_size": 768,
126
+ "id2label": {
127
+ "0": "LABEL_0",
128
+ "1": "LABEL_1"
129
+ },
130
+ "initializer_range": 0.02,
131
+ "intermediate_size": 3072,
132
+ "is_decoder": false,
133
+ "is_encoder_decoder": false,
134
+ "label2id": {
135
+ "LABEL_0": 0,
136
+ "LABEL_1": 1
137
+ },
138
+ "layer_norm_eps": 1e-12,
139
+ "length_penalty": 1.0,
140
+ "max_length": 20,
141
+ "max_position_embeddings": 4096,
142
+ "min_length": 0,
143
+ "model_type": "big_bird",
144
+ "no_repeat_ngram_size": 0,
145
+ "num_attention_heads": 12,
146
+ "num_beam_groups": 1,
147
+ "num_beams": 1,
148
+ "num_hidden_layers": 6,
149
+ "num_random_blocks": 3,
150
+ "num_return_sequences": 1,
151
+ "output_attentions": false,
152
+ "output_hidden_states": false,
153
+ "output_scores": false,
154
+ "pad_token_id": 0,
155
+ "position_embedding_type": "absolute",
156
+ "prefix": null,
157
+ "problem_type": null,
158
+ "pruned_heads": {},
159
+ "remove_invalid_values": false,
160
+ "repetition_penalty": 1.0,
161
+ "rescale_embeddings": false,
162
+ "return_dict": true,
163
+ "return_dict_in_generate": false,
164
+ "sep_token_id": 3,
165
+ "task_specific_params": null,
166
+ "temperature": 1.0,
167
+ "tie_encoder_decoder": false,
168
+ "tie_word_embeddings": true,
169
+ "tokenizer_class": "BertTokenizer",
170
+ "top_k": 50,
171
+ "top_p": 1.0,
172
+ "torch_dtype": "float32",
173
+ "torchscript": false,
174
+ "transformers_version": "4.11.0",
175
+ "type_vocab_size": 2,
176
+ "use_bfloat16": false,
177
+ "use_bias": true,
178
+ "use_cache": true,
179
+ "vocab_size": 32500
180
+ },
181
+ "is_encoder_decoder": true,
182
+ "model_type": "encoder-decoder",
183
+ "torch_dtype": "float32",
184
+ "transformers_version": null
185
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66a5fc7cfec99b85e7d781abc6cb38ebcab0e3ccd22a90b10adf082fcacb8354
3
+ size 718021289
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "do_basic_tokenize": true, "never_split": null, "bos_token": "<s>", "eos_token": "</s>", "model_max_length": 4096, "special_tokens_map_file": "/opt/ml/.cache/huggingface/transformers/9bea998b48658e35dd618115a266f6c173183a9a4261fc6e40730d74c4b67899.e3640e465e51ce85d94923a0b396029ecc2e3e4c7764031eee57ab272637652d", "name_or_path": "metamong1/bigbird-tapt-ep3", "tokenizer_class": "BertTokenizer"}
trainer_state.json ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 41.2618,
3
+ "best_model_checkpoint": "checkpoint/kobigbirdbart_tapt_ep3_bs16_pre_noam_LB/checkpoint-34340",
4
+ "epoch": 2.9999708802888674,
5
+ "global_step": 51510,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.06,
12
+ "learning_rate": 0.0001,
13
+ "loss": 4.0417,
14
+ "step": 1000
15
+ },
16
+ {
17
+ "epoch": 0.12,
18
+ "learning_rate": 7.072834474681765e-05,
19
+ "loss": 3.6688,
20
+ "step": 2000
21
+ },
22
+ {
23
+ "epoch": 0.17,
24
+ "learning_rate": 5.775426231077362e-05,
25
+ "loss": 3.5243,
26
+ "step": 3000
27
+ },
28
+ {
29
+ "epoch": 0.23,
30
+ "learning_rate": 5.001874180112048e-05,
31
+ "loss": 3.4178,
32
+ "step": 4000
33
+ },
34
+ {
35
+ "epoch": 0.29,
36
+ "learning_rate": 4.473924094197383e-05,
37
+ "loss": 3.367,
38
+ "step": 5000
39
+ },
40
+ {
41
+ "epoch": 0.35,
42
+ "learning_rate": 4.084183301607125e-05,
43
+ "loss": 3.3013,
44
+ "step": 6000
45
+ },
46
+ {
47
+ "epoch": 0.41,
48
+ "learning_rate": 3.7812639995975144e-05,
49
+ "loss": 3.2593,
50
+ "step": 7000
51
+ },
52
+ {
53
+ "epoch": 0.47,
54
+ "learning_rate": 3.537080170562066e-05,
55
+ "loss": 3.212,
56
+ "step": 8000
57
+ },
58
+ {
59
+ "epoch": 0.52,
60
+ "learning_rate": 3.3348143212252787e-05,
61
+ "loss": 3.1631,
62
+ "step": 9000
63
+ },
64
+ {
65
+ "epoch": 0.58,
66
+ "learning_rate": 3.16370022285456e-05,
67
+ "loss": 3.1368,
68
+ "step": 10000
69
+ },
70
+ {
71
+ "epoch": 0.64,
72
+ "learning_rate": 3.0164835160286395e-05,
73
+ "loss": 3.106,
74
+ "step": 11000
75
+ },
76
+ {
77
+ "epoch": 0.7,
78
+ "learning_rate": 2.8880740270466042e-05,
79
+ "loss": 3.0851,
80
+ "step": 12000
81
+ },
82
+ {
83
+ "epoch": 0.76,
84
+ "learning_rate": 2.774780664821333e-05,
85
+ "loss": 3.0639,
86
+ "step": 13000
87
+ },
88
+ {
89
+ "epoch": 0.82,
90
+ "learning_rate": 2.6738528983830193e-05,
91
+ "loss": 3.0412,
92
+ "step": 14000
93
+ },
94
+ {
95
+ "epoch": 0.87,
96
+ "learning_rate": 2.583193464319257e-05,
97
+ "loss": 3.0153,
98
+ "step": 15000
99
+ },
100
+ {
101
+ "epoch": 0.93,
102
+ "learning_rate": 2.5011715272671623e-05,
103
+ "loss": 2.9905,
104
+ "step": 16000
105
+ },
106
+ {
107
+ "epoch": 0.99,
108
+ "learning_rate": 2.426497258953522e-05,
109
+ "loss": 2.9887,
110
+ "step": 17000
111
+ },
112
+ {
113
+ "epoch": 1.0,
114
+ "eval_gen_len": 20.0,
115
+ "eval_loss": 3.2666966915130615,
116
+ "eval_rouge1": 43.6697,
117
+ "eval_rouge2": 28.3814,
118
+ "eval_rougeL": 40.2632,
119
+ "eval_rougeLsum": 40.2565,
120
+ "eval_runtime": 2924.633,
121
+ "eval_samples_per_second": 31.314,
122
+ "eval_steps_per_second": 1.957,
123
+ "step": 17170
124
+ },
125
+ {
126
+ "epoch": 1.05,
127
+ "learning_rate": 2.3581353179274502e-05,
128
+ "loss": 2.7893,
129
+ "step": 18000
130
+ },
131
+ {
132
+ "epoch": 1.11,
133
+ "learning_rate": 2.2952437303925296e-05,
134
+ "loss": 2.724,
135
+ "step": 19000
136
+ },
137
+ {
138
+ "epoch": 1.16,
139
+ "learning_rate": 2.2371298045736806e-05,
140
+ "loss": 2.722,
141
+ "step": 20000
142
+ },
143
+ {
144
+ "epoch": 1.22,
145
+ "learning_rate": 2.183217738419129e-05,
146
+ "loss": 2.7157,
147
+ "step": 21000
148
+ },
149
+ {
150
+ "epoch": 1.28,
151
+ "learning_rate": 2.1330244234932196e-05,
152
+ "loss": 2.7097,
153
+ "step": 22000
154
+ },
155
+ {
156
+ "epoch": 1.34,
157
+ "learning_rate": 2.0861411017276734e-05,
158
+ "loss": 2.6939,
159
+ "step": 23000
160
+ },
161
+ {
162
+ "epoch": 1.4,
163
+ "learning_rate": 2.0422192722261335e-05,
164
+ "loss": 2.7026,
165
+ "step": 24000
166
+ },
167
+ {
168
+ "epoch": 1.46,
169
+ "learning_rate": 2.0009597313304793e-05,
170
+ "loss": 2.6871,
171
+ "step": 25000
172
+ },
173
+ {
174
+ "epoch": 1.51,
175
+ "learning_rate": 1.9621039546293084e-05,
176
+ "loss": 2.6776,
177
+ "step": 26000
178
+ },
179
+ {
180
+ "epoch": 1.57,
181
+ "learning_rate": 1.925427251574924e-05,
182
+ "loss": 2.6799,
183
+ "step": 27000
184
+ },
185
+ {
186
+ "epoch": 1.63,
187
+ "learning_rate": 1.8907332773261164e-05,
188
+ "loss": 2.6613,
189
+ "step": 28000
190
+ },
191
+ {
192
+ "epoch": 1.69,
193
+ "learning_rate": 1.8578495948447844e-05,
194
+ "loss": 2.6756,
195
+ "step": 29000
196
+ },
197
+ {
198
+ "epoch": 1.75,
199
+ "learning_rate": 1.8266240576954328e-05,
200
+ "loss": 2.6459,
201
+ "step": 30000
202
+ },
203
+ {
204
+ "epoch": 1.81,
205
+ "learning_rate": 1.796921840006843e-05,
206
+ "loss": 2.6352,
207
+ "step": 31000
208
+ },
209
+ {
210
+ "epoch": 1.86,
211
+ "learning_rate": 1.768622981064213e-05,
212
+ "loss": 2.6379,
213
+ "step": 32000
214
+ },
215
+ {
216
+ "epoch": 1.92,
217
+ "learning_rate": 1.7416203423613194e-05,
218
+ "loss": 2.6138,
219
+ "step": 33000
220
+ },
221
+ {
222
+ "epoch": 1.98,
223
+ "learning_rate": 1.7158178976540783e-05,
224
+ "loss": 2.6193,
225
+ "step": 34000
226
+ },
227
+ {
228
+ "epoch": 2.0,
229
+ "eval_gen_len": 20.0,
230
+ "eval_loss": 3.2624921798706055,
231
+ "eval_rouge1": 44.7484,
232
+ "eval_rouge2": 29.1819,
233
+ "eval_rougeL": 41.2553,
234
+ "eval_rougeLsum": 41.2618,
235
+ "eval_runtime": 2925.4113,
236
+ "eval_samples_per_second": 31.306,
237
+ "eval_steps_per_second": 1.957,
238
+ "step": 34340
239
+ },
240
+ {
241
+ "epoch": 2.04,
242
+ "learning_rate": 1.6911292937114278e-05,
243
+ "loss": 2.5111,
244
+ "step": 35000
245
+ },
246
+ {
247
+ "epoch": 2.1,
248
+ "learning_rate": 1.6674766325339214e-05,
249
+ "loss": 2.4361,
250
+ "step": 36000
251
+ },
252
+ {
253
+ "epoch": 2.15,
254
+ "learning_rate": 1.644789435859962e-05,
255
+ "loss": 2.4523,
256
+ "step": 37000
257
+ },
258
+ {
259
+ "epoch": 2.21,
260
+ "learning_rate": 1.6230037605656824e-05,
261
+ "loss": 2.4485,
262
+ "step": 38000
263
+ },
264
+ {
265
+ "epoch": 2.27,
266
+ "learning_rate": 1.6020614396421562e-05,
267
+ "loss": 2.443,
268
+ "step": 39000
269
+ },
270
+ {
271
+ "epoch": 2.33,
272
+ "learning_rate": 1.58190942821137e-05,
273
+ "loss": 2.4408,
274
+ "step": 40000
275
+ },
276
+ {
277
+ "epoch": 2.39,
278
+ "learning_rate": 1.562499237823282e-05,
279
+ "loss": 2.4376,
280
+ "step": 41000
281
+ },
282
+ {
283
+ "epoch": 2.45,
284
+ "learning_rate": 1.5437864452872526e-05,
285
+ "loss": 2.4237,
286
+ "step": 42000
287
+ },
288
+ {
289
+ "epoch": 2.5,
290
+ "learning_rate": 1.5257302647033036e-05,
291
+ "loss": 2.4302,
292
+ "step": 43000
293
+ },
294
+ {
295
+ "epoch": 2.56,
296
+ "learning_rate": 1.508293173302068e-05,
297
+ "loss": 2.4272,
298
+ "step": 44000
299
+ },
300
+ {
301
+ "epoch": 2.62,
302
+ "learning_rate": 1.4914405832764997e-05,
303
+ "loss": 2.4191,
304
+ "step": 45000
305
+ },
306
+ {
307
+ "epoch": 2.68,
308
+ "learning_rate": 1.4751405530700888e-05,
309
+ "loss": 2.415,
310
+ "step": 46000
311
+ },
312
+ {
313
+ "epoch": 2.74,
314
+ "learning_rate": 1.4593635326349197e-05,
315
+ "loss": 2.4139,
316
+ "step": 47000
317
+ },
318
+ {
319
+ "epoch": 2.8,
320
+ "learning_rate": 1.4440821380348212e-05,
321
+ "loss": 2.409,
322
+ "step": 48000
323
+ },
324
+ {
325
+ "epoch": 2.85,
326
+ "learning_rate": 1.4292709514804612e-05,
327
+ "loss": 2.4043,
328
+ "step": 49000
329
+ },
330
+ {
331
+ "epoch": 2.91,
332
+ "learning_rate": 1.4149063434732295e-05,
333
+ "loss": 2.3984,
334
+ "step": 50000
335
+ },
336
+ {
337
+ "epoch": 2.97,
338
+ "learning_rate": 1.4009663142259388e-05,
339
+ "loss": 2.4011,
340
+ "step": 51000
341
+ },
342
+ {
343
+ "epoch": 3.0,
344
+ "eval_gen_len": 20.0,
345
+ "eval_loss": 3.302915573120117,
346
+ "eval_rouge1": 44.3235,
347
+ "eval_rouge2": 28.8361,
348
+ "eval_rougeL": 40.7694,
349
+ "eval_rougeLsum": 40.7674,
350
+ "eval_runtime": 2928.7847,
351
+ "eval_samples_per_second": 31.27,
352
+ "eval_steps_per_second": 1.954,
353
+ "step": 51510
354
+ },
355
+ {
356
+ "epoch": 3.0,
357
+ "step": 51510,
358
+ "total_flos": 3.55488261007809e+17,
359
+ "train_loss": 2.7859744007345357,
360
+ "train_runtime": 40305.1206,
361
+ "train_samples_per_second": 20.449,
362
+ "train_steps_per_second": 1.278
363
+ }
364
+ ],
365
+ "max_steps": 51510,
366
+ "num_train_epochs": 3,
367
+ "total_flos": 3.55488261007809e+17,
368
+ "trial_name": null,
369
+ "trial_params": null
370
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff