researchaccount committed on
Commit
24ebd86
1 Parent(s): 3a50343

Added files

Browse files
config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "UBC-NLP/MARBERT",
3
+ "architectures": [
4
+ "BertForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "directionality": "bidi",
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "pooler_fc_size": 768,
21
+ "pooler_num_attention_heads": 12,
22
+ "pooler_num_fc_layers": 3,
23
+ "pooler_size_per_head": 128,
24
+ "pooler_type": "first_token_transform",
25
+ "position_embedding_type": "absolute",
26
+ "transformers_version": "4.6.0.dev0",
27
+ "type_vocab_size": 2,
28
+ "use_cache": true,
29
+ "vocab_size": 100000
30
+ }
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8667c6d1cce7439c7e5617729c973d341c0787cd72911d1e462263bda83e1eb3
3
+ size 1303669859
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8043d989d1925377fa06cef574d7e0d7ea83a8f753527412f8ea2284b7dda83
3
+ size 651861714
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:168f2977615e20afb1beaa2d89e12cdcb41d6587b8985716c2123b1516b7f707
3
+ size 623
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "do_basic_tokenize": true, "never_split": null, "special_tokens_map_file": null, "name_or_path": "UBC-NLP/MARBERT"}
trainer_state.json ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 20.0,
5
+ "global_step": 17200,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.58,
12
+ "learning_rate": 4.854651162790698e-05,
13
+ "loss": 3.572,
14
+ "step": 500
15
+ },
16
+ {
17
+ "epoch": 1.0,
18
+ "eval_loss": 3.3753416538238525,
19
+ "eval_runtime": 146.7285,
20
+ "eval_samples_per_second": 163.69,
21
+ "step": 860
22
+ },
23
+ {
24
+ "epoch": 1.16,
25
+ "learning_rate": 4.709302325581396e-05,
26
+ "loss": 3.4691,
27
+ "step": 1000
28
+ },
29
+ {
30
+ "epoch": 1.74,
31
+ "learning_rate": 4.563953488372093e-05,
32
+ "loss": 3.2751,
33
+ "step": 1500
34
+ },
35
+ {
36
+ "epoch": 2.0,
37
+ "eval_loss": 3.3520445823669434,
38
+ "eval_runtime": 147.0097,
39
+ "eval_samples_per_second": 163.377,
40
+ "step": 1720
41
+ },
42
+ {
43
+ "epoch": 2.33,
44
+ "learning_rate": 4.418604651162791e-05,
45
+ "loss": 3.1479,
46
+ "step": 2000
47
+ },
48
+ {
49
+ "epoch": 2.91,
50
+ "learning_rate": 4.2732558139534885e-05,
51
+ "loss": 3.0201,
52
+ "step": 2500
53
+ },
54
+ {
55
+ "epoch": 3.0,
56
+ "eval_loss": 3.3326475620269775,
57
+ "eval_runtime": 146.9025,
58
+ "eval_samples_per_second": 163.496,
59
+ "step": 2580
60
+ },
61
+ {
62
+ "epoch": 3.49,
63
+ "learning_rate": 4.127906976744187e-05,
64
+ "loss": 2.8872,
65
+ "step": 3000
66
+ },
67
+ {
68
+ "epoch": 4.0,
69
+ "eval_loss": 3.355100154876709,
70
+ "eval_runtime": 146.8556,
71
+ "eval_samples_per_second": 163.548,
72
+ "step": 3440
73
+ },
74
+ {
75
+ "epoch": 4.07,
76
+ "learning_rate": 3.9825581395348835e-05,
77
+ "loss": 2.8373,
78
+ "step": 3500
79
+ },
80
+ {
81
+ "epoch": 4.65,
82
+ "learning_rate": 3.837209302325582e-05,
83
+ "loss": 2.6775,
84
+ "step": 4000
85
+ },
86
+ {
87
+ "epoch": 5.0,
88
+ "eval_loss": 3.35534405708313,
89
+ "eval_runtime": 146.787,
90
+ "eval_samples_per_second": 163.625,
91
+ "step": 4300
92
+ },
93
+ {
94
+ "epoch": 5.23,
95
+ "learning_rate": 3.691860465116279e-05,
96
+ "loss": 2.6133,
97
+ "step": 4500
98
+ },
99
+ {
100
+ "epoch": 5.81,
101
+ "learning_rate": 3.5465116279069774e-05,
102
+ "loss": 2.5406,
103
+ "step": 5000
104
+ },
105
+ {
106
+ "epoch": 6.0,
107
+ "eval_loss": 3.336427927017212,
108
+ "eval_runtime": 146.5342,
109
+ "eval_samples_per_second": 163.907,
110
+ "step": 5160
111
+ },
112
+ {
113
+ "epoch": 6.4,
114
+ "learning_rate": 3.401162790697674e-05,
115
+ "loss": 2.4318,
116
+ "step": 5500
117
+ },
118
+ {
119
+ "epoch": 6.98,
120
+ "learning_rate": 3.2558139534883724e-05,
121
+ "loss": 2.4171,
122
+ "step": 6000
123
+ },
124
+ {
125
+ "epoch": 7.0,
126
+ "eval_loss": 3.332899808883667,
127
+ "eval_runtime": 146.7159,
128
+ "eval_samples_per_second": 163.704,
129
+ "step": 6020
130
+ },
131
+ {
132
+ "epoch": 7.56,
133
+ "learning_rate": 3.11046511627907e-05,
134
+ "loss": 2.267,
135
+ "step": 6500
136
+ },
137
+ {
138
+ "epoch": 8.0,
139
+ "eval_loss": 3.3342535495758057,
140
+ "eval_runtime": 146.2923,
141
+ "eval_samples_per_second": 164.178,
142
+ "step": 6880
143
+ },
144
+ {
145
+ "epoch": 8.14,
146
+ "learning_rate": 2.9651162790697678e-05,
147
+ "loss": 2.232,
148
+ "step": 7000
149
+ },
150
+ {
151
+ "epoch": 8.72,
152
+ "learning_rate": 2.8197674418604653e-05,
153
+ "loss": 2.1545,
154
+ "step": 7500
155
+ },
156
+ {
157
+ "epoch": 9.0,
158
+ "eval_loss": 3.3147542476654053,
159
+ "eval_runtime": 147.8917,
160
+ "eval_samples_per_second": 162.403,
161
+ "step": 7740
162
+ },
163
+ {
164
+ "epoch": 9.3,
165
+ "learning_rate": 2.674418604651163e-05,
166
+ "loss": 2.1127,
167
+ "step": 8000
168
+ },
169
+ {
170
+ "epoch": 9.88,
171
+ "learning_rate": 2.5290697674418607e-05,
172
+ "loss": 2.064,
173
+ "step": 8500
174
+ },
175
+ {
176
+ "epoch": 10.0,
177
+ "eval_loss": 3.3158812522888184,
178
+ "eval_runtime": 147.2862,
179
+ "eval_samples_per_second": 163.07,
180
+ "step": 8600
181
+ },
182
+ {
183
+ "epoch": 10.47,
184
+ "learning_rate": 2.3837209302325582e-05,
185
+ "loss": 1.9663,
186
+ "step": 9000
187
+ },
188
+ {
189
+ "epoch": 11.0,
190
+ "eval_loss": 3.3465089797973633,
191
+ "eval_runtime": 148.1203,
192
+ "eval_samples_per_second": 162.152,
193
+ "step": 9460
194
+ },
195
+ {
196
+ "epoch": 11.05,
197
+ "learning_rate": 2.238372093023256e-05,
198
+ "loss": 1.9353,
199
+ "step": 9500
200
+ },
201
+ {
202
+ "epoch": 11.63,
203
+ "learning_rate": 2.0930232558139536e-05,
204
+ "loss": 1.8572,
205
+ "step": 10000
206
+ },
207
+ {
208
+ "epoch": 12.0,
209
+ "eval_loss": 3.2874529361724854,
210
+ "eval_runtime": 147.3935,
211
+ "eval_samples_per_second": 162.952,
212
+ "step": 10320
213
+ },
214
+ {
215
+ "epoch": 12.21,
216
+ "learning_rate": 1.9476744186046514e-05,
217
+ "loss": 1.8373,
218
+ "step": 10500
219
+ },
220
+ {
221
+ "epoch": 12.79,
222
+ "learning_rate": 1.802325581395349e-05,
223
+ "loss": 1.7866,
224
+ "step": 11000
225
+ },
226
+ {
227
+ "epoch": 13.0,
228
+ "eval_loss": 3.321674108505249,
229
+ "eval_runtime": 146.9637,
230
+ "eval_samples_per_second": 163.428,
231
+ "step": 11180
232
+ },
233
+ {
234
+ "epoch": 13.37,
235
+ "learning_rate": 1.6569767441860464e-05,
236
+ "loss": 1.7253,
237
+ "step": 11500
238
+ },
239
+ {
240
+ "epoch": 13.95,
241
+ "learning_rate": 1.5116279069767441e-05,
242
+ "loss": 1.719,
243
+ "step": 12000
244
+ },
245
+ {
246
+ "epoch": 14.0,
247
+ "eval_loss": 3.2760589122772217,
248
+ "eval_runtime": 145.6136,
249
+ "eval_samples_per_second": 164.943,
250
+ "step": 12040
251
+ },
252
+ {
253
+ "epoch": 14.53,
254
+ "learning_rate": 1.3662790697674418e-05,
255
+ "loss": 1.6293,
256
+ "step": 12500
257
+ },
258
+ {
259
+ "epoch": 15.0,
260
+ "eval_loss": 3.302276611328125,
261
+ "eval_runtime": 146.2216,
262
+ "eval_samples_per_second": 164.258,
263
+ "step": 12900
264
+ },
265
+ {
266
+ "epoch": 15.12,
267
+ "learning_rate": 1.2209302325581395e-05,
268
+ "loss": 1.6339,
269
+ "step": 13000
270
+ },
271
+ {
272
+ "epoch": 15.7,
273
+ "learning_rate": 1.0755813953488372e-05,
274
+ "loss": 1.5656,
275
+ "step": 13500
276
+ },
277
+ {
278
+ "epoch": 16.0,
279
+ "eval_loss": 3.2689430713653564,
280
+ "eval_runtime": 145.5127,
281
+ "eval_samples_per_second": 165.058,
282
+ "step": 13760
283
+ },
284
+ {
285
+ "epoch": 16.28,
286
+ "learning_rate": 9.302325581395349e-06,
287
+ "loss": 1.5503,
288
+ "step": 14000
289
+ },
290
+ {
291
+ "epoch": 16.86,
292
+ "learning_rate": 7.848837209302325e-06,
293
+ "loss": 1.5344,
294
+ "step": 14500
295
+ },
296
+ {
297
+ "epoch": 17.0,
298
+ "eval_loss": 3.237142562866211,
299
+ "eval_runtime": 146.4021,
300
+ "eval_samples_per_second": 164.055,
301
+ "step": 14620
302
+ },
303
+ {
304
+ "epoch": 17.44,
305
+ "learning_rate": 6.395348837209303e-06,
306
+ "loss": 1.4974,
307
+ "step": 15000
308
+ },
309
+ {
310
+ "epoch": 18.0,
311
+ "eval_loss": 3.209892749786377,
312
+ "eval_runtime": 147.2158,
313
+ "eval_samples_per_second": 163.148,
314
+ "step": 15480
315
+ },
316
+ {
317
+ "epoch": 18.02,
318
+ "learning_rate": 4.941860465116279e-06,
319
+ "loss": 1.4739,
320
+ "step": 15500
321
+ },
322
+ {
323
+ "epoch": 18.6,
324
+ "learning_rate": 3.488372093023256e-06,
325
+ "loss": 1.4574,
326
+ "step": 16000
327
+ },
328
+ {
329
+ "epoch": 19.0,
330
+ "eval_loss": 3.2351009845733643,
331
+ "eval_runtime": 146.7384,
332
+ "eval_samples_per_second": 163.679,
333
+ "step": 16340
334
+ },
335
+ {
336
+ "epoch": 19.19,
337
+ "learning_rate": 2.0348837209302328e-06,
338
+ "loss": 1.4283,
339
+ "step": 16500
340
+ },
341
+ {
342
+ "epoch": 19.77,
343
+ "learning_rate": 5.813953488372093e-07,
344
+ "loss": 1.4204,
345
+ "step": 17000
346
+ },
347
+ {
348
+ "epoch": 20.0,
349
+ "eval_loss": 3.2112162113189697,
350
+ "eval_runtime": 147.2288,
351
+ "eval_samples_per_second": 163.134,
352
+ "step": 17200
353
+ }
354
+ ],
355
+ "max_steps": 17200,
356
+ "num_train_epochs": 20,
357
+ "total_flos": 6.8827072512e+16,
358
+ "trial_name": null,
359
+ "trial_params": null
360
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2228a7d58ab4a59c8b8432e8dee6d0f33970fcaf9a8320173aff80a4368584d3
3
+ size 2351
vocab.txt ADDED
The diff for this file is too large to render. See raw diff